feat: initialize Kurdistan SDK - independent fork of Polkadot SDK

This commit is contained in:
2025-12-13 15:44:15 +03:00
commit 286de54384
6841 changed files with 1848356 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
This folder holds all networking subsystem implementations, each in its own crate.
@@ -0,0 +1,56 @@
[package]
name = "pezkuwi-approval-distribution"
version = "7.0.0"
description = "Pezkuwi Approval Distribution subsystem for the distribution of assignments and approvals for approval checks on candidates over the network."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
itertools = { workspace = true }
pezkuwi-node-metrics = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
futures = { workspace = true }
futures-timer = { workspace = true }
gum = { workspace = true, default-features = true }
[dev-dependencies]
sc-keystore = { workspace = true }
sp-application-crypto = { workspace = true, default-features = true }
sp-authority-discovery = { workspace = true, default-features = true }
sp-core = { features = ["std"], workspace = true, default-features = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-primitives-test-helpers = { workspace = true }
assert_matches = { workspace = true }
rand_chacha = { workspace = true, default-features = true }
schnorrkel = { workspace = true }
# rand_core should match schnorrkel
rand_core = { workspace = true }
sp-tracing = { workspace = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-metrics/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sp-authority-discovery/runtime-benchmarks",
]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,272 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_metrics::metrics::{prometheus, Metrics as MetricsTrait};
use pezkuwi_node_primitives::approval::v2::AssignmentCertKindV2;
/// Approval Distribution metrics.
///
/// Wraps an optional set of Prometheus collectors. When the inner value is `None` (which is
/// what the derived `Default` produces), the recording methods do nothing and the timer
/// methods return `None`.
#[derive(Default, Clone)]
pub struct Metrics(Option<MetricsInner>);
/// The Prometheus collectors backing [`Metrics`].
#[derive(Clone)]
struct MetricsInner {
	/// Number of valid assignments imported, labelled by `kind`.
	assignments_imported_total: prometheus::CounterVec<prometheus::U64>,
	/// Number of valid approvals imported.
	approvals_imported_total: prometheus::Counter<prometheus::U64>,
	/// Number of times `unify_with_peer` is called.
	unified_with_peer_total: prometheus::Counter<prometheus::U64>,
	/// Number of messages for which aggression L1 has been triggered.
	aggression_l1_messages_total: prometheus::Counter<prometheus::U64>,
	/// Number of messages for which aggression L2 has been triggered.
	aggression_l2_messages_total: prometheus::Counter<prometheus::U64>,
	/// Time spent within `unify_with_peer`.
	time_unify_with_peer: prometheus::Histogram,
	/// Time spent on importing pending assignments and approvals.
	time_import_pending_now_known: prometheus::Histogram,
	/// Result of each processed assignment, labelled by `status`.
	assignments_received_result: prometheus::CounterVec<prometheus::U64>,
	/// Result of each processed approval, labelled by `status`.
	approvals_received_result: prometheus::CounterVec<prometheus::U64>,
}
/// Conversion of a value into the string used as a Prometheus label value.
trait AsLabel {
	/// Returns the label value representing `self`.
	fn as_label(&self) -> &str;
}
// Label values reported for each assignment certificate kind.
impl AsLabel for &AssignmentCertKindV2 {
	fn as_label(&self) -> &str {
		// `self` is a reference to a reference here; peel one layer before matching.
		let cert_kind: &AssignmentCertKindV2 = self;
		match cert_kind {
			AssignmentCertKindV2::RelayVRFModuloCompact { .. } => "VRF Modulo Compact",
			AssignmentCertKindV2::RelayVRFModulo { .. } => "VRF Modulo",
			AssignmentCertKindV2::RelayVRFDelay { .. } => "VRF Delay",
		}
	}
}
impl Metrics {
	/// Bumps the processed-approval result counter for the given `status` label value.
	fn note_approval_result(&self, status: &'static str) {
		if let Some(inner) = self.0.as_ref() {
			inner.approvals_received_result.with_label_values(&[status]).inc();
		}
	}

	/// Bumps the processed-assignment result counter for the given `status` label value.
	fn note_assignment_result(&self, status: &'static str) {
		if let Some(inner) = self.0.as_ref() {
			inner.assignments_received_result.with_label_values(&[status]).inc();
		}
	}

	/// Records one imported assignment of the given certificate `kind`.
	pub(crate) fn on_assignment_imported(&self, kind: &AssignmentCertKindV2) {
		if let Some(inner) = self.0.as_ref() {
			inner.assignments_imported_total.with_label_values(&[kind.as_label()]).inc();
		}
	}

	/// Records one imported approval.
	pub(crate) fn on_approval_imported(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.approvals_imported_total.inc();
		}
	}

	/// Records one `unify_with_peer` invocation.
	pub(crate) fn on_unify_with_peer(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.unified_with_peer_total.inc();
		}
	}

	/// Starts a timer on the `unify_with_peer` histogram; `None` when metrics are disabled.
	pub(crate) fn time_unify_with_peer(&self) -> Option<prometheus::prometheus::HistogramTimer> {
		let inner = self.0.as_ref()?;
		Some(inner.time_unify_with_peer.start_timer())
	}

	/// Starts a timer on the pending-import histogram; `None` when metrics are disabled.
	pub(crate) fn time_import_pending_now_known(
		&self,
	) -> Option<prometheus::prometheus::HistogramTimer> {
		let inner = self.0.as_ref()?;
		Some(inner.time_import_pending_now_known.start_timer())
	}

	/// Counts an approval under the `outdated` status.
	pub(crate) fn on_approval_recent_outdated(&self) {
		self.note_approval_result("outdated")
	}

	/// Counts an approval under the `invalidblock` status.
	pub(crate) fn on_approval_invalid_block(&self) {
		self.note_approval_result("invalidblock")
	}

	/// Counts an approval under the `unknownassignment` status.
	pub(crate) fn on_approval_unknown_assignment(&self) {
		self.note_approval_result("unknownassignment")
	}

	/// Counts an approval under the `duplicate` status.
	pub(crate) fn on_approval_duplicate(&self) {
		self.note_approval_result("duplicate")
	}

	/// Counts an approval under the `outofview` status.
	pub(crate) fn on_approval_out_of_view(&self) {
		self.note_approval_result("outofview")
	}

	/// Counts an approval under the `goodknown` status.
	pub(crate) fn on_approval_good_known(&self) {
		self.note_approval_result("goodknown")
	}

	/// Counts an approval under the `bad` status.
	pub(crate) fn on_approval_bad(&self) {
		self.note_approval_result("bad")
	}

	/// Counts an approval under the `bug` status.
	pub(crate) fn on_approval_bug(&self) {
		self.note_approval_result("bug")
	}

	/// Counts an assignment under the `outdated` status.
	pub(crate) fn on_assignment_recent_outdated(&self) {
		self.note_assignment_result("outdated")
	}

	/// Counts an assignment under the `invalidblock` status.
	pub(crate) fn on_assignment_invalid_block(&self) {
		self.note_assignment_result("invalidblock")
	}

	/// Counts an assignment under the `duplicate` status.
	pub(crate) fn on_assignment_duplicate(&self) {
		self.note_assignment_result("duplicate")
	}

	/// Counts an assignment under the `outofview` status.
	pub(crate) fn on_assignment_out_of_view(&self) {
		self.note_assignment_result("outofview")
	}

	/// Counts an assignment under the `goodknown` status.
	pub(crate) fn on_assignment_good_known(&self) {
		self.note_assignment_result("goodknown")
	}

	/// Counts an assignment under the `bad` status.
	pub(crate) fn on_assignment_bad(&self) {
		self.note_assignment_result("bad")
	}

	/// Counts an assignment under the `far` status.
	pub(crate) fn on_assignment_far(&self) {
		self.note_assignment_result("far")
	}

	/// Records one message for which aggression L1 was triggered.
	pub(crate) fn on_aggression_l1(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.aggression_l1_messages_total.inc();
		}
	}

	/// Records one message for which aggression L2 was triggered.
	pub(crate) fn on_aggression_l2(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.aggression_l2_messages_total.inc();
		}
	}
}
impl MetricsTrait for Metrics {
	/// Creates every approval-distribution collector and registers it with `registry`.
	///
	/// Collectors are created and registered one after another; the first failure is returned
	/// and the remaining collectors are not created.
	fn try_register(registry: &prometheus::Registry) -> Result<Self, prometheus::PrometheusError> {
		let assignments_imported_total: prometheus::CounterVec<prometheus::U64> =
			prometheus::register(
				prometheus::CounterVec::new(
					prometheus::Opts::new(
						"pezkuwi_teyrchain_assignments_imported_total",
						"Number of valid assignments imported locally or from other peers.",
					),
					&["kind"],
				)?,
				registry,
			)?;

		let approvals_imported_total: prometheus::Counter<prometheus::U64> = prometheus::register(
			prometheus::Counter::new(
				"pezkuwi_teyrchain_approvals_imported_total",
				"Number of valid approvals imported locally or from other peers.",
			)?,
			registry,
		)?;

		let unified_with_peer_total: prometheus::Counter<prometheus::U64> = prometheus::register(
			prometheus::Counter::new(
				"pezkuwi_teyrchain_unified_with_peer_total",
				"Number of times `unify_with_peer` is called.",
			)?,
			registry,
		)?;

		let aggression_l1_messages_total: prometheus::Counter<prometheus::U64> =
			prometheus::register(
				prometheus::Counter::new(
					"pezkuwi_teyrchain_approval_distribution_aggression_l1_messages_total",
					"Number of messages in approval distribution for which aggression L1 has been triggered",
				)?,
				registry,
			)?;

		let aggression_l2_messages_total: prometheus::Counter<prometheus::U64> =
			prometheus::register(
				prometheus::Counter::new(
					"pezkuwi_teyrchain_approval_distribution_aggression_l2_messages_total",
					"Number of messages in approval distribution for which aggression L2 has been triggered",
				)?,
				registry,
			)?;

		let unify_opts = prometheus::HistogramOpts::new(
			"pezkuwi_teyrchain_time_unify_with_peer",
			"Time spent within fn `unify_with_peer`.",
		)
		.buckets(vec![
			0.000625, 0.00125, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5,
			5.0, 10.0,
		]);
		let time_unify_with_peer: prometheus::Histogram =
			prometheus::register(prometheus::Histogram::with_opts(unify_opts)?, registry)?;

		let import_pending_opts = prometheus::HistogramOpts::new(
			"pezkuwi_teyrchain_time_import_pending_now_known",
			"Time spent on importing pending assignments and approvals.",
		)
		.buckets(vec![
			0.0001, 0.0004, 0.0016, 0.0064, 0.0256, 0.1024, 0.4096, 1.6384, 3.2768, 4.9152, 6.5536,
		]);
		let time_import_pending_now_known: prometheus::Histogram =
			prometheus::register(prometheus::Histogram::with_opts(import_pending_opts)?, registry)?;

		let assignments_received_result: prometheus::CounterVec<prometheus::U64> =
			prometheus::register(
				prometheus::CounterVec::new(
					prometheus::Opts::new(
						"pezkuwi_teyrchain_assignments_received_result",
						"Result of a processed assignment",
					),
					&["status"],
				)?,
				registry,
			)?;

		let approvals_received_result: prometheus::CounterVec<prometheus::U64> =
			prometheus::register(
				prometheus::CounterVec::new(
					prometheus::Opts::new(
						"pezkuwi_teyrchain_approvals_received_result",
						"Result of a processed approval",
					),
					&["status"],
				)?,
				registry,
			)?;

		Ok(Metrics(Some(MetricsInner {
			assignments_imported_total,
			approvals_imported_total,
			unified_with_peer_total,
			aggression_l1_messages_total,
			aggression_l2_messages_total,
			time_unify_with_peer,
			time_import_pending_now_known,
			assignments_received_result,
			approvals_received_result,
		})))
	}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,63 @@
[package]
name = "pezkuwi-availability-distribution"
description = "The Availability Distribution subsystem. Requests the required availability data. Also distributes availability data and chunks to requesters."
version = "7.0.0"
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[[bench]]
name = "availability-distribution-regression-bench"
path = "benches/availability-distribution-regression-bench.rs"
harness = false
required-features = ["subsystem-benchmarks"]
[dependencies]
codec = { features = ["std"], workspace = true, default-features = true }
fatality = { workspace = true }
futures = { workspace = true }
gum = { workspace = true, default-features = true }
pezkuwi-erasure-coding = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
schnellru = { workspace = true }
sp-core = { features = ["std"], workspace = true, default-features = true }
sp-keystore = { workspace = true, default-features = true }
thiserror = { workspace = true }
[dev-dependencies]
assert_matches = { workspace = true }
futures-timer = { workspace = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-primitives-test-helpers = { workspace = true }
pezkuwi-subsystem-bench = { workspace = true }
rstest = { workspace = true }
sp-keyring = { workspace = true, default-features = true }
sp-tracing = { workspace = true, default-features = true }
[features]
subsystem-benchmarks = []
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-erasure-coding/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"pezkuwi-subsystem-bench/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
@@ -0,0 +1,87 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! availability-write regression tests
//!
//! Availability write benchmark based on Kusama parameters and scale.
//!
//! Subsystems involved:
//! - availability-distribution
//! - bitfield-distribution
//! - availability-store
use pezkuwi_subsystem_bench::{
availability::{benchmark_availability_write, prepare_test, TestState},
configuration::TestConfiguration,
usage::BenchmarkUsage,
utils::save_to_file,
};
use std::io::Write;
/// Number of benchmark runs performed by `main`; the reported usage is the average over them.
const BENCH_COUNT: usize = 50;
/// Runs the availability write benchmark `BENCH_COUNT` times, stores the averaged usage as
/// chart JSON and compares network and CPU usage against the expected baselines.
///
/// Returns `Err("Regressions found")` if any check reports a message, after printing all
/// messages to stderr.
fn main() -> Result<(), String> {
	// A single node effort roughly
	let mut config = TestConfiguration::default();
	config.n_cores = 10;
	config.n_validators = 500;
	config.num_blocks = 3;
	config.generate_pov_sizes();
	let state = TestState::new(&config);

	println!("Benchmarking...");
	let mut usages: Vec<BenchmarkUsage> = Vec::with_capacity(BENCH_COUNT);
	for run in 0..BENCH_COUNT {
		// Redraw the progress bar in place: one `#` per finished run.
		print!("\r[{}{}]", "#".repeat(run), "_".repeat(BENCH_COUNT - run));
		std::io::stdout().flush().unwrap();

		let (mut env, _cfgs) = prepare_test(
			&state,
			pezkuwi_subsystem_bench::availability::TestDataAvailability::Write,
			false,
		);
		let usage = env.runtime().block_on(benchmark_availability_write(&mut env, &state));
		usages.push(usage);
	}
	println!("\rDone!{}", " ".repeat(BENCH_COUNT));

	let average_usage = BenchmarkUsage::average(&usages);
	let chart_json = average_usage.to_chart_json().map_err(|e| e.to_string())?;
	save_to_file("charts/availability-distribution-regression-bench.json", chart_json)
		.map_err(|e| e.to_string())?;
	println!("{}", average_usage);

	let mut messages = Vec::new();

	// We expect no variance for received and sent
	// but use 0.001 because we operate with floats
	messages.extend(average_usage.check_network_usage(&[
		("Received from peers", 433.3333, 0.001),
		("Sent to peers", 18479.9000, 0.001),
	]));
	messages.extend(average_usage.check_cpu_usage(&[
		("availability-distribution", 0.0131, 0.1),
		("availability-store", 0.1576, 0.1),
		("bitfield-distribution", 0.0224, 0.1),
	]));

	if !messages.is_empty() {
		eprintln!("{}", messages.join("\n"));
		return Err("Regressions found".to_string());
	}
	Ok(())
}
@@ -0,0 +1,126 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Error handling related code and Error/Result definitions.
use fatality::Nested;
use pezkuwi_node_network_protocol::request_response::outgoing::RequestError;
use pezkuwi_primitives::SessionIndex;
use futures::channel::oneshot;
use pezkuwi_node_subsystem::{ChainApiError, RuntimeApiError, SubsystemError};
use pezkuwi_node_subsystem_util::runtime;
use crate::LOG_TARGET;
// Errors of the availability distribution subsystem.
//
// Variants marked `#[fatal]` are the fatal ones; unmarked variants are the ones `log_error`
// merely logs. The rest of this crate refers to `FatalError` and `JfyiError`, which are
// presumably generated by the `splitable` option of the `fatality` macro - TODO confirm.
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
	// Spawning one of the subsystem's background tasks failed.
	#[fatal]
	#[error("Spawning subsystem task failed: {0}")]
	SpawnTask(#[source] SubsystemError),
	// The requester stream yielded `None`.
	#[fatal]
	#[error("Erasure chunk requester stream exhausted")]
	RequesterExhausted,
	// Receiving the next message from the overseer failed.
	#[fatal]
	#[error("Receive channel closed: {0}")]
	IncomingMessageChannel(#[source] SubsystemError),
	// NOTE(review): `forward` presumably delegates the fatal/non-fatal decision to the wrapped
	// `runtime::Error`; `log_error` handles a non-fatal `Runtime` case - confirm.
	#[fatal(forward)]
	#[error("Error while accessing runtime information: {0}")]
	Runtime(#[from] runtime::Error),
	#[fatal]
	#[error("Oneshot for receiving response from Chain API got cancelled")]
	ChainApiSenderDropped(#[from] oneshot::Canceled),
	#[fatal]
	#[error("Retrieving response from Chain API unexpectedly failed with error: {0}")]
	ChainApi(#[from] ChainApiError),
	#[error("Failed to get node features from the runtime")]
	FailedNodeFeatures(#[source] RuntimeApiError),
	// av-store will drop the sender on any error that happens.
	#[error("Response channel to obtain chunk failed")]
	QueryChunkResponseChannel(#[source] oneshot::Canceled),
	// av-store will drop the sender on any error that happens.
	#[error("Response channel to obtain available data failed")]
	QueryAvailableDataResponseChannel(#[source] oneshot::Canceled),
	// We tried accessing a session that was not cached.
	#[error("Session {missing_session} is not cached, cached sessions: {available_sessions:?}.")]
	NoSuchCachedSession { available_sessions: Vec<SessionIndex>, missing_session: SessionIndex },
	// Sending request response failed (Can happen on timeouts for example).
	#[error("Sending a request's response failed.")]
	SendResponse,
	// The PoV fetching request itself failed.
	#[error("FetchPoV request error: {0}")]
	FetchPoV(#[source] RequestError),
	// The hash of the received PoV differs from the requested one.
	#[error("Fetched PoV does not match expected hash")]
	UnexpectedPoV,
	#[error("Remote responded with `NoSuchPoV`")]
	NoSuchPoV,
	#[error("Given validator index could not be found in current session")]
	InvalidValidatorIndex,
	#[error("Erasure coding error: {0}")]
	ErasureCoding(#[from] pezkuwi_erasure_coding::Error),
}
/// General result abbreviation type alias.
///
/// Shorthand for `std::result::Result` with this module's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
/// Consume a top-level result, logging a non-fatal error instead of propagating it.
///
/// The subsystem generally wants to keep running after an error, so only fatal errors are
/// handed back to the caller. Every other error is logged together with `ctx` and then
/// dropped: some kinds always produce a warning, the remaining ones are rate limited through
/// `warn_freq`.
pub fn log_error(
	result: Result<()>,
	ctx: &'static str,
	warn_freq: &mut gum::Freq,
) -> std::result::Result<(), FatalError> {
	// `?` returns early on the fatal part; what is left is the non-fatal outcome.
	if let Err(jfyi) = result.into_nested()? {
		match jfyi {
			JfyiError::UnexpectedPoV |
			JfyiError::InvalidValidatorIndex |
			JfyiError::NoSuchCachedSession { .. } |
			JfyiError::QueryAvailableDataResponseChannel(_) |
			JfyiError::QueryChunkResponseChannel(_) |
			JfyiError::FailedNodeFeatures(_) |
			JfyiError::ErasureCoding(_) => gum::warn!(target: LOG_TARGET, error = %jfyi, ctx),
			JfyiError::FetchPoV(_) |
			JfyiError::SendResponse |
			JfyiError::NoSuchPoV |
			JfyiError::Runtime(_) => {
				gum::warn_if_frequent!(freq: warn_freq, max_rate: gum::Times::PerHour(100), target: LOG_TARGET, error = ?jfyi, ctx)
			},
		}
	}
	Ok(())
}
@@ -0,0 +1,199 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use futures::{future::Either, FutureExt, StreamExt, TryFutureExt};
use sp_keystore::KeystorePtr;
use pezkuwi_node_network_protocol::request_response::{
v1, v2, IncomingRequestReceiver, ReqProtocolNames,
};
use pezkuwi_node_subsystem::{
messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal,
SpawnedSubsystem, SubsystemError,
};
/// Error and [`Result`] type for this subsystem.
mod error;
use error::{log_error, FatalError, Result};
use pezkuwi_node_subsystem_util::runtime::RuntimeInfo;
/// `Requester` taking care of requesting chunks for candidates pending availability.
mod requester;
use requester::Requester;
/// Handling requests for PoVs during backing.
mod pov_requester;
/// Responding to erasure chunk requests:
mod responder;
use responder::{run_chunk_receivers, run_pov_receiver};
mod metrics;
/// Prometheus `Metrics` for availability distribution.
pub use metrics::Metrics;
#[cfg(test)]
mod tests;
/// Target passed to the `gum` logging macros throughout this crate.
const LOG_TARGET: &str = "teyrchain::availability-distribution";
/// The availability distribution subsystem.
///
/// Built with `AvailabilityDistributionSubsystem::new`; all fields are taken apart again when
/// the subsystem's `run` loop starts.
pub struct AvailabilityDistributionSubsystem {
	/// Easy and efficient runtime access for this subsystem.
	runtime: RuntimeInfo,
	/// Receivers to receive messages from.
	///
	/// They are handed to the `pov-receiver` and `chunk-receiver` tasks spawned by `run`.
	recvs: IncomingRequestReceivers,
	/// Mapping of the req-response protocols to the full protocol names.
	req_protocol_names: ReqProtocolNames,
	/// Prometheus metrics.
	///
	/// Cloned into the requester and into each spawned task.
	metrics: Metrics,
}
/// Receivers to be passed into availability distribution.
///
/// Bundles one receiver per request protocol served by this subsystem, so that they can be
/// handed to `AvailabilityDistributionSubsystem::new` as a single argument.
pub struct IncomingRequestReceivers {
	/// Receiver for incoming PoV requests.
	pub pov_req_receiver: IncomingRequestReceiver<v1::PoVFetchingRequest>,
	/// Receiver for incoming v1 availability chunk requests.
	pub chunk_req_v1_receiver: IncomingRequestReceiver<v1::ChunkFetchingRequest>,
	/// Receiver for incoming v2 availability chunk requests.
	pub chunk_req_v2_receiver: IncomingRequestReceiver<v2::ChunkFetchingRequest>,
}
#[overseer::subsystem(AvailabilityDistribution, error=SubsystemError, prefix=self::overseer)]
impl<Context> AvailabilityDistributionSubsystem {
	/// Wraps the subsystem's `run` loop into a boxed future, tagging any error it returns with
	/// the `availability-distribution` origin.
	fn start(self, ctx: Context) -> SpawnedSubsystem {
		SpawnedSubsystem {
			name: "availability-distribution-subsystem",
			future: self
				.run(ctx)
				.map_err(|err| SubsystemError::with_origin("availability-distribution", err))
				.boxed(),
		}
	}
}
#[overseer::contextbounds(AvailabilityDistribution, prefix = self::overseer)]
impl AvailabilityDistributionSubsystem {
/// Create a new instance of the availability distribution.
pub fn new(
keystore: KeystorePtr,
recvs: IncomingRequestReceivers,
req_protocol_names: ReqProtocolNames,
metrics: Metrics,
) -> Self {
let runtime = RuntimeInfo::new(Some(keystore));
Self { runtime, recvs, req_protocol_names, metrics }
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
let Self { mut runtime, recvs, metrics, req_protocol_names } = self;
let IncomingRequestReceivers {
pov_req_receiver,
chunk_req_v1_receiver,
chunk_req_v2_receiver,
} = recvs;
let mut requester = Requester::new(req_protocol_names, metrics.clone()).fuse();
let mut warn_freq = gum::Freq::new();
{
let sender = ctx.sender().clone();
ctx.spawn(
"pov-receiver",
run_pov_receiver(sender.clone(), pov_req_receiver, metrics.clone()).boxed(),
)
.map_err(FatalError::SpawnTask)?;
ctx.spawn(
"chunk-receiver",
run_chunk_receivers(
sender,
chunk_req_v1_receiver,
chunk_req_v2_receiver,
metrics.clone(),
)
.boxed(),
)
.map_err(FatalError::SpawnTask)?;
}
loop {
let action = {
let mut subsystem_next = ctx.recv().fuse();
futures::select! {
subsystem_msg = subsystem_next => Either::Left(subsystem_msg),
from_task = requester.next() => Either::Right(from_task),
}
};
// Handle task messages sending:
let message = match action {
Either::Left(subsystem_msg) =>
subsystem_msg.map_err(|e| FatalError::IncomingMessageChannel(e))?,
Either::Right(from_task) => {
let from_task = from_task.ok_or(FatalError::RequesterExhausted)?;
ctx.send_message(from_task).await;
continue;
},
};
match message {
FromOrchestra::Signal(OverseerSignal::ActiveLeaves(update)) => {
log_error(
requester
.get_mut()
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await,
"Error in Requester::update_fetching_heads",
&mut warn_freq,
)?;
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, _finalized_number)) => {
},
FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
FromOrchestra::Communication {
msg:
AvailabilityDistributionMessage::FetchPoV {
relay_parent,
from_validator,
para_id,
candidate_hash,
pov_hash,
tx,
},
} => {
log_error(
pov_requester::fetch_pov(
&mut ctx,
&mut runtime,
relay_parent,
from_validator,
para_id,
candidate_hash,
pov_hash,
tx,
metrics.clone(),
)
.await,
"pov_requester::fetch_pov",
&mut warn_freq,
)?;
},
}
}
}
}
@@ -0,0 +1,156 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::{
metrics,
metrics::{
prometheus,
prometheus::{Counter, CounterVec, Opts, PrometheusError, Registry, U64},
},
};
/// Label for success counters.
pub const SUCCEEDED: &str = "succeeded";
/// Label for fail counters.
pub const FAILED: &str = "failed";
/// Label for chunks/PoVs that could not be served, because they were not available.
pub const NOT_FOUND: &str = "not-found";
/// Availability Distribution metrics.
///
/// Holds `None` when nothing is reported (see `new_dummy` and the derived `Default`); in that
/// state every `on_*` method is a no-op.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
/// The Prometheus collectors backing [`Metrics`].
///
/// All counter vectors are registered with a single label named `success`.
#[derive(Clone)]
struct MetricsInner {
	/// Number of chunks fetched.
	///
	/// Note: The failed count gets incremented, when we were not able to fetch the chunk at all.
	/// For times, where we failed downloading, but succeeded on the next try (with different
	/// backers), see `retries`.
	fetched_chunks: CounterVec<U64>,
	/// Number of chunks served.
	served_chunks: CounterVec<U64>,
	/// Number of received fetch PoV responses.
	fetched_povs: CounterVec<U64>,
	/// Number of PoVs served.
	served_povs: CounterVec<U64>,
	/// Number of times our first set of validators did not provide the needed chunk and we had to
	/// query further validators.
	retries: Counter<U64>,
}
impl Metrics {
/// Create new dummy metrics, not reporting anything.
pub fn new_dummy() -> Self {
Metrics(None)
}
/// Increment counter on fetched labels.
pub fn on_fetch(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.fetched_chunks.with_label_values(&[label]).inc()
}
}
/// Increment counter on served chunks.
pub fn on_served_chunk(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.served_chunks.with_label_values(&[label]).inc()
}
}
/// Increment counter on fetched PoVs.
pub fn on_fetched_pov(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.fetched_povs.with_label_values(&[label]).inc()
}
}
/// Increment counter on served PoVs.
pub fn on_served_pov(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.served_povs.with_label_values(&[label]).inc()
}
}
/// Increment retry counter.
pub fn on_retry(&self) {
if let Some(metrics) = &self.0 {
metrics.retries.inc()
}
}
}
impl metrics::Metrics for Metrics {
fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
let metrics = MetricsInner {
fetched_chunks: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_fetched_chunks_total",
"Total number of fetched chunks.",
),
&["success"]
)?,
registry,
)?,
served_chunks: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_served_chunks_total",
"Total number of chunks served by this backer.",
),
&["success"]
)?,
registry,
)?,
fetched_povs: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_fetched_povs_total",
"Total number of povs fetches by this backer.",
),
&["success"]
)?,
registry,
)?,
served_povs: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_served_povs_total",
"Total number of povs served by this backer.",
),
&["success"]
)?,
registry,
)?,
retries: prometheus::register(
Counter::new(
"pezkuwi_teyrchain_fetch_retries_total",
"Number of times we did not succeed in fetching a chunk and needed to try more backers.",
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
}
@@ -0,0 +1,239 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! PoV requester takes care of requesting PoVs from validators of a backing group.
use futures::{channel::oneshot, future::BoxFuture, FutureExt};
use pezkuwi_node_network_protocol::request_response::{
outgoing::{RequestError, Requests},
v1::{PoVFetchingRequest, PoVFetchingResponse},
OutgoingRequest, Recipient,
};
use pezkuwi_node_primitives::PoV;
use pezkuwi_node_subsystem::{
messages::{IfDisconnected, NetworkBridgeTxMessage},
overseer,
};
use pezkuwi_node_subsystem_util::runtime::RuntimeInfo;
use pezkuwi_primitives::{AuthorityDiscoveryId, CandidateHash, Hash, Id as ParaId, ValidatorIndex};
use crate::{
error::{Error, FatalError, JfyiError, Result},
metrics::{FAILED, NOT_FOUND, SUCCEEDED},
Metrics, LOG_TARGET,
};
/// Request a `PoV` from a validator and spawn a background task awaiting the response.
///
/// Looks up the discovery key of `from_validator` in the session info for `parent`, sends a
/// `PoVFetchingV1` request to that authority and spawns the `pov-fetcher` task, which delivers
/// the PoV through `tx`.
///
/// Fails with `InvalidValidatorIndex` if `from_validator` has no discovery key in that
/// session, and with a fatal `SpawnTask` error if the task cannot be spawned.
#[overseer::contextbounds(AvailabilityDistribution, prefix = self::overseer)]
pub async fn fetch_pov<Context>(
	ctx: &mut Context,
	runtime: &mut RuntimeInfo,
	parent: Hash,
	from_validator: ValidatorIndex,
	para_id: ParaId,
	candidate_hash: CandidateHash,
	pov_hash: Hash,
	tx: oneshot::Sender<PoV>,
	metrics: Metrics,
) -> Result<()> {
	let info = &runtime.get_session_info(ctx.sender(), parent).await?.session_info;
	let authority_id = info
		.discovery_keys
		.get(from_validator.0 as usize)
		.cloned()
		.ok_or(JfyiError::InvalidValidatorIndex)?;

	let recipient = Recipient::Authority(authority_id.clone());
	let (req, pending_response) =
		OutgoingRequest::new(recipient, PoVFetchingRequest { candidate_hash });
	let requests = vec![Requests::PoVFetchingV1(req)];
	ctx.send_message(NetworkBridgeTxMessage::SendRequests(
		requests,
		IfDisconnected::ImmediateError,
	))
	.await;

	let job =
		fetch_pov_job(para_id, pov_hash, authority_id, pending_response.boxed(), tx, metrics)
			.boxed();
	ctx.spawn("pov-fetcher", job).map_err(FatalError::SpawnTask)?;
	Ok(())
}
/// Body of the spawned `pov-fetcher` task.
///
/// Runs `do_fetch_pov` to completion and logs a warning if it fails; the error is not
/// propagated any further.
async fn fetch_pov_job(
	para_id: ParaId,
	pov_hash: Hash,
	authority_id: AuthorityDiscoveryId,
	pending_response: BoxFuture<'static, std::result::Result<PoVFetchingResponse, RequestError>>,
	tx: oneshot::Sender<PoV>,
	metrics: Metrics,
) {
	let outcome = do_fetch_pov(pov_hash, pending_response, tx, metrics).await;
	// The binding must stay `err`: its name is the field name in the log line.
	if let Err(err) = outcome {
		gum::warn!(target: LOG_TARGET, ?err, ?para_id, ?pov_hash, ?authority_id, "fetch_pov_job");
	}
}
/// Do the actual work of waiting for the response.
async fn do_fetch_pov(
pov_hash: Hash,
pending_response: BoxFuture<'static, std::result::Result<PoVFetchingResponse, RequestError>>,
tx: oneshot::Sender<PoV>,
metrics: Metrics,
) -> Result<()> {
let response = pending_response.await.map_err(Error::FetchPoV);
let pov = match response {
Ok(PoVFetchingResponse::PoV(pov)) => pov,
Ok(PoVFetchingResponse::NoSuchPoV) => {
metrics.on_fetched_pov(NOT_FOUND);
return Err(Error::NoSuchPoV);
},
Err(err) => {
metrics.on_fetched_pov(FAILED);
return Err(err);
},
};
if pov.hash() == pov_hash {
metrics.on_fetched_pov(SUCCEEDED);
tx.send(pov).map_err(|_| Error::SendResponse)
} else {
metrics.on_fetched_pov(FAILED);
Err(Error::UnexpectedPoV)
}
}
// Tests for `fetch_pov`: a mocked overseer answers the runtime API and network requests.
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use futures::{executor, future};
use codec::Encode;
use sc_network::ProtocolName;
use sp_core::testing::TaskExecutor;
use pezkuwi_node_primitives::BlockData;
use pezkuwi_node_subsystem::messages::{
AllMessages, AvailabilityDistributionMessage, RuntimeApiMessage, RuntimeApiRequest,
};
use pezkuwi_node_subsystem_test_helpers as test_helpers;
use pezkuwi_primitives::{CandidateHash, ExecutorParams, Hash, NodeFeatures, ValidatorIndex};
use test_helpers::mock::make_ferdie_keystore;
use super::*;
use crate::{tests::mock::make_session_info, LOG_TARGET};
/// The requested hash (`Hash::default()`) is not derived from the served `PoV`, so the `PoV`
/// must not be forwarded.
#[test]
fn rejects_invalid_pov() {
sp_tracing::try_init_simple();
let pov = PoV { block_data: BlockData(vec![1, 2, 3, 4, 5, 6]) };
test_run(Hash::default(), pov);
}
/// The requested hash is the hash of the served `PoV`, so the `PoV` must be forwarded.
#[test]
fn accepts_valid_pov() {
sp_tracing::try_init_simple();
let pov = PoV { block_data: BlockData(vec![1, 2, 3, 4, 5, 6]) };
test_run(pov.hash(), pov);
}
/// Run `fetch_pov` against a mocked overseer.
///
/// `pov_hash` is the hash passed to `fetch_pov`, `pov` is what the mocked network answers with.
/// If both match, the `PoV` is expected on the receiving end of the channel, otherwise the
/// channel is expected to be canceled.
fn test_run(pov_hash: Hash, pov: PoV) {
let pool = TaskExecutor::new();
let (mut context, mut virtual_overseer) =
pezkuwi_node_subsystem_test_helpers::make_subsystem_context::<
AvailabilityDistributionMessage,
TaskExecutor,
>(pool.clone());
let keystore = make_ferdie_keystore();
let mut runtime = pezkuwi_node_subsystem_util::runtime::RuntimeInfo::new(Some(keystore));
let (tx, rx) = oneshot::channel();
// The code under test: `fetch_pov` itself is expected to return successfully in both cases.
let testee = async {
fetch_pov(
&mut context,
&mut runtime,
Hash::default(),
ValidatorIndex(0),
ParaId::default(),
CandidateHash::default(),
pov_hash,
tx,
Metrics::new_dummy(),
)
.await
.expect("Should succeed");
};
// The mocked overseer side: answer requests until the network request has been served.
let tester = async move {
loop {
match virtual_overseer.recv().await {
// Runtime API requests are answered with mock data.
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
tx.send(Ok(0)).unwrap();
},
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionInfo(_, tx),
)) => {
tx.send(Ok(Some(make_session_info()))).unwrap();
},
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionExecutorParams(_, tx),
)) => {
tx.send(Ok(Some(ExecutorParams::default()))).unwrap();
},
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::NodeFeatures(_, si_tx),
)) => {
si_tx.send(Ok(NodeFeatures::EMPTY)).unwrap();
},
// The PoV request: respond with the encoded `pov` and stop serving messages.
AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendRequests(
mut reqs,
_,
)) => {
let req = assert_matches!(
reqs.pop(),
Some(Requests::PoVFetchingV1(outgoing)) => {outgoing}
);
req.pending_response
.send(Ok((
PoVFetchingResponse::PoV(pov.clone()).encode(),
ProtocolName::from(""),
)))
.unwrap();
break;
},
msg => gum::debug!(target: LOG_TARGET, msg = ?msg, "Received msg"),
}
}
// A matching PoV is forwarded; a mismatching one results in the sender being dropped.
if pov.hash() == pov_hash {
assert_eq!(rx.await, Ok(pov));
} else {
assert_eq!(rx.await, Err(oneshot::Canceled));
}
};
futures::pin_mut!(testee);
futures::pin_mut!(tester);
executor::block_on(future::join(testee, tester));
}
}
@@ -0,0 +1,560 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashSet;
use futures::{
channel::{mpsc, oneshot},
future::select,
FutureExt, SinkExt,
};
use codec::Decode;
use pezkuwi_erasure_coding::branch_hash;
use pezkuwi_node_network_protocol::request_response::{
outgoing::{OutgoingRequest, Recipient, RequestError, Requests},
v1::{self, ChunkResponse},
v2,
};
use pezkuwi_node_primitives::ErasureChunk;
use pezkuwi_node_subsystem::{
messages::{AvailabilityStoreMessage, IfDisconnected, NetworkBridgeTxMessage},
overseer,
};
use pezkuwi_primitives::{
AuthorityDiscoveryId, BlakeTwo256, CandidateHash, ChunkIndex, GroupIndex, Hash, HashT,
OccupiedCore, SessionIndex,
};
use sc_network::ProtocolName;
use crate::{
error::{FatalError, Result},
metrics::{Metrics, FAILED, SUCCEEDED},
requester::session_cache::{BadValidators, SessionInfo},
LOG_TARGET,
};
#[cfg(test)]
mod tests;
/// Configuration for a `FetchTask`
///
/// This exists to separate the preparation of a `FetchTask` from actually starting it, which
/// is beneficial as this allows us to take session info by reference.
pub struct FetchTaskConfig {
/// The task to spawn on start, or `None` if no task needs to run (`FetchTaskConfig::new`
/// leaves this empty when the candidate belongs to our own backing group).
prepared_running: Option<RunningTask>,
/// Relay chain leaves this candidate is live in; initially only the leaf passed to `new`.
live_in: HashSet<Hash>,
}
/// Information about a task fetching an erasure chunk.
pub struct FetchTask {
/// For what relay parents this task is relevant.
///
/// In other words, for which relay chain parents this candidate is considered live.
/// This is updated on every `ActiveLeavesUpdate` and enables us to know when we can safely
/// stop keeping track of that candidate/chunk.
pub(crate) live_in: HashSet<Hash>,
/// We keep the task around until `live_in` becomes empty, to make
/// sure we won't re-fetch an already fetched candidate.
state: FetchedState,
}
/// State of a particular candidate chunk fetching process.
enum FetchedState {
/// Chunk fetch has started.
///
/// Once the contained `Sender` is dropped, any still running task will be canceled.
Started(oneshot::Sender<()>),
/// All relevant `live_in` have been removed, before we were able to get our chunk.
///
/// This is also the initial state when `FetchTask::start` is given a configuration without
/// a prepared task, i.e. when nothing was spawned in the first place.
Canceled,
}
/// Messages sent from `FetchTask`s to be handled/forwarded.
pub enum FromFetchTask {
/// Message to another subsystem (the running task uses this for network requests and for
/// storing the fetched chunk).
Message(overseer::AvailabilityDistributionOutgoingMessages),
/// Concluded with result.
///
/// In case of `None` everything was fine, in case of `Some`, some validators in the group
/// did not serve us our chunk as expected.
Concluded(Option<BadValidators>),
/// We were not able to fetch the desired chunk for the given `CandidateHash`.
Failed(CandidateHash),
}
/// Information a running task needs.
struct RunningTask {
/// For what session we have been spawned.
session_index: SessionIndex,
/// Index of validator group to fetch the chunk from.
///
/// Needed for reporting bad validators.
group_index: GroupIndex,
/// Validators to request the chunk from.
///
/// This vector gets drained from the back during execution of the task; it is only fully
/// emptied if no validator delivered a valid chunk.
group: Vec<AuthorityDiscoveryId>,
/// The request to send. We can store it as either v1 or v2, they have the same payload.
request: v2::ChunkFetchingRequest,
/// Root hash, for verifying the chunk's validity.
erasure_root: Hash,
/// Relay parent of the candidate to fetch.
relay_parent: Hash,
/// Sender for communicating with other subsystems and reporting results.
sender: mpsc::Sender<FromFetchTask>,
/// Prometheus metrics for reporting results.
metrics: Metrics,
/// Expected chunk index. We'll validate that the remote did send us the correct chunk (only
/// important for v2 requests).
chunk_index: ChunkIndex,
/// Full protocol name for ChunkFetchingV1.
req_v1_protocol_name: ProtocolName,
/// Full protocol name for ChunkFetchingV2.
req_v2_protocol_name: ProtocolName,
}
impl FetchTaskConfig {
/// Create a new configuration for a [`FetchTask`].
///
/// The result of this function can be passed into [`FetchTask::start`].
pub fn new(
leaf: Hash,
core: &OccupiedCore,
sender: mpsc::Sender<FromFetchTask>,
metrics: Metrics,
session_info: &SessionInfo,
chunk_index: ChunkIndex,
req_v1_protocol_name: ProtocolName,
req_v2_protocol_name: ProtocolName,
) -> Self {
let live_in = vec![leaf].into_iter().collect();
// Don't run tasks for our backing group:
if session_info.our_group == Some(core.group_responsible) {
return FetchTaskConfig { live_in, prepared_running: None };
}
let prepared_running = RunningTask {
session_index: session_info.session_index,
group_index: core.group_responsible,
group: session_info.validator_groups.get(core.group_responsible.0 as usize)
.expect("The responsible group of a candidate should be available in the corresponding session. qed.")
.clone(),
request: v2::ChunkFetchingRequest {
candidate_hash: core.candidate_hash,
index: session_info.our_index,
},
erasure_root: core.candidate_descriptor.erasure_root(),
relay_parent: core.candidate_descriptor.relay_parent(),
metrics,
sender,
chunk_index,
req_v1_protocol_name,
req_v2_protocol_name
};
FetchTaskConfig { live_in, prepared_running: Some(prepared_running) }
}
}
#[overseer::contextbounds(AvailabilityDistribution, prefix = self::overseer)]
impl FetchTask {
    /// Turn a prepared configuration into a tracked task.
    ///
    /// If the configuration carries a prepared task, it is spawned as `chunk-fetcher` and the
    /// returned `FetchTask` holds the handle that keeps it alive. Without a prepared task
    /// nothing is spawned and the returned `FetchTask` starts out as canceled.
    ///
    /// # Errors
    ///
    /// Returns `FatalError::SpawnTask` if the task could not be spawned.
    pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self> {
        let FetchTaskConfig { prepared_running, live_in } = config;
        let state = match prepared_running {
            Some(running) => {
                let (handle, kill) = oneshot::channel();
                ctx.spawn("chunk-fetcher", running.run(kill).boxed())
                    .map_err(FatalError::SpawnTask)?;
                FetchedState::Started(handle)
            },
            None => FetchedState::Canceled,
        };
        Ok(FetchTask { live_in, state })
    }

    /// Register another leaf in which this task's candidate is live.
    ///
    /// Pure book keeping, so we know a given chunk is already being fetched.
    pub fn add_leaf(&mut self, leaf: Hash) {
        self.live_in.insert(leaf);
    }

    /// Forget the given leaves and cancel the task if no leaf is left and the task has not
    /// finished yet.
    pub fn remove_leaves(&mut self, leaves: &HashSet<Hash>) {
        self.live_in.retain(|leaf| !leaves.contains(leaf));
        if self.live_in.is_empty() && !self.is_finished() {
            // Replacing the state drops the handle of a started task, which cancels it.
            self.state = FetchedState::Canceled
        }
    }

    /// Whether the candidate is still pending availability on at least one tracked relay
    /// parent.
    pub fn is_live(&self) -> bool {
        !self.live_in.is_empty()
    }

    /// Whether this task can be considered finished.
    ///
    /// That is, it is either canceled, succeeded or failed.
    pub fn is_finished(&self) -> bool {
        match &self.state {
            FetchedState::Started(sender) => sender.is_canceled(),
            FetchedState::Canceled => true,
        }
    }
}
/// Things that can go wrong in task execution.
#[derive(Debug)]
enum TaskError {
/// The peer failed to deliver a correct chunk for some reason (has been reported as
/// appropriate).
///
/// The task reacts by marking the validator as bad and moving on to the next one.
PeerError,
/// This very node is seemingly shutting down (sending of message failed).
///
/// The task gives up immediately in this case.
ShuttingDown,
}
impl RunningTask {
/// Drive the fetch until it completes or until `kill` resolves, whichever happens first.
///
/// `kill` is the receiving half of the cancellation channel created in `FetchTask::start`.
async fn run(self, kill: oneshot::Receiver<()>) {
    let fetch = self.run_inner();
    futures::pin_mut!(fetch);
    // Which side finished first is irrelevant, in both cases there is nothing left to do.
    let _ = select(fetch, kill).await;
}
/// Fetch our chunk from the backing group and store it.
///
/// The validators of the group are asked one after the other, starting from the back of
/// `self.group`, until one of them delivers a chunk that passes validation. Validators that
/// fail to do so are collected and reported when the task concludes successfully. If nobody
/// delivers, the task reports failure instead.
async fn run_inner(mut self) {
    let mut bad_validators = Vec::new();
    let mut attempts: u32 = 0;
    let mut network_error_freq = gum::Freq::new();
    let mut canceled_freq = gum::Freq::new();

    // `pop` takes from the back, hence validators are tried in reverse order.
    while let Some(validator) = self.group.pop() {
        // Every attempt but the first one is a retry.
        if attempts > 0 {
            self.metrics.on_retry();
        }
        attempts += 1;

        let response =
            self.do_request(&validator, &mut network_error_freq, &mut canceled_freq).await;
        let chunk = match response {
            Ok(Some(chunk)) => chunk,
            Ok(None) => {
                gum::debug!(
                    target: LOG_TARGET,
                    validator = ?validator,
                    relay_parent = ?self.relay_parent,
                    group_index = ?self.group_index,
                    session_index = ?self.session_index,
                    chunk_index = ?self.request.index,
                    candidate_hash = ?self.request.candidate_hash,
                    "Validator did not have our chunk"
                );
                bad_validators.push(validator);
                continue;
            },
            Err(TaskError::PeerError) => {
                bad_validators.push(validator);
                continue;
            },
            Err(TaskError::ShuttingDown) => {
                gum::info!(
                    target: LOG_TARGET,
                    "Node seems to be shutting down, canceling fetch task"
                );
                self.metrics.on_fetch(FAILED);
                return;
            },
        };

        // Only genuine data gets stored.
        if !self.validate_chunk(&validator, &chunk, self.chunk_index) {
            bad_validators.push(validator);
            continue;
        }

        // Got it: store the chunk and report everybody who let us down on the way.
        self.store_chunk(chunk).await;
        self.metrics.on_fetch(SUCCEEDED);
        self.conclude(bad_validators).await;
        return;
    }

    // The whole group has been tried without success.
    self.metrics.on_fetch(FAILED);
    self.conclude_fail().await
}
/// Do request and return response, if successful.
async fn do_request(
&mut self,
validator: &AuthorityDiscoveryId,
network_error_freq: &mut gum::Freq,
canceled_freq: &mut gum::Freq,
) -> std::result::Result<Option<ErasureChunk>, TaskError> {
gum::trace!(
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
"Starting chunk request",
);
let (full_request, response_recv) = OutgoingRequest::new_with_fallback(
Recipient::Authority(validator.clone()),
self.request,
// Fallback to v1, for backwards compatibility.
v1::ChunkFetchingRequest::from(self.request),
);
let requests = Requests::ChunkFetching(full_request);
self.sender
.send(FromFetchTask::Message(
NetworkBridgeTxMessage::SendRequests(
vec![requests],
IfDisconnected::ImmediateError,
)
.into(),
))
.await
.map_err(|_| TaskError::ShuttingDown)?;
match response_recv.await {
Ok((bytes, protocol)) => match protocol {
_ if protocol == self.req_v2_protocol_name =>
match v2::ChunkFetchingResponse::decode(&mut &bytes[..]) {
Ok(chunk_response) => Ok(Option::<ErasureChunk>::from(chunk_response)),
Err(e) => {
gum::warn!(
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
err = ?e,
"Peer sent us invalid erasure chunk data (v2)"
);
Err(TaskError::PeerError)
},
},
_ if protocol == self.req_v1_protocol_name =>
match v1::ChunkFetchingResponse::decode(&mut &bytes[..]) {
Ok(chunk_response) => Ok(Option::<ChunkResponse>::from(chunk_response)
.map(|c| c.recombine_into_chunk(&self.request.into()))),
Err(e) => {
gum::warn!(
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
err = ?e,
"Peer sent us invalid erasure chunk data"
);
Err(TaskError::PeerError)
},
},
_ => {
gum::warn!(
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
"Peer sent us invalid erasure chunk data - unknown protocol"
);
Err(TaskError::PeerError)
},
},
Err(RequestError::InvalidResponse(err)) => {
gum::warn!(
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
err = ?err,
"Peer sent us invalid erasure chunk data"
);
Err(TaskError::PeerError)
},
Err(RequestError::NetworkError(err)) => {
gum::warn_if_frequent!(
freq: network_error_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
err = ?err,
"Some network error occurred when fetching erasure chunk"
);
Err(TaskError::PeerError)
},
Err(RequestError::Canceled(oneshot::Canceled)) => {
gum::warn_if_frequent!(
freq: canceled_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
origin = ?validator,
relay_parent = ?self.relay_parent,
group_index = ?self.group_index,
session_index = ?self.session_index,
chunk_index = ?self.request.index,
candidate_hash = ?self.request.candidate_hash,
"Erasure chunk request got canceled"
);
Err(TaskError::PeerError)
},
}
}
/// Check that `chunk` is the chunk we asked for and that it belongs to our erasure root.
///
/// Returns `false` (after logging a warning) if the chunk carries an index other than
/// `expected_chunk_index`, if the hash for its merkle proof cannot be calculated or if that
/// hash does not match the hash of the chunk data. `validator` is only used for logging.
fn validate_chunk(
    &self,
    validator: &AuthorityDiscoveryId,
    chunk: &ErasureChunk,
    expected_chunk_index: ChunkIndex,
) -> bool {
    if chunk.index != expected_chunk_index {
        gum::warn!(
            target: LOG_TARGET,
            candidate_hash = ?self.request.candidate_hash,
            origin = ?validator,
            chunk_index = ?chunk.index,
            expected_chunk_index = ?expected_chunk_index,
            "Validator sent the wrong chunk",
        );
        return false;
    }

    let proof_result = branch_hash(&self.erasure_root, chunk.proof(), chunk.index.0 as usize);
    let anticipated_hash = match proof_result {
        Ok(hash) => hash,
        Err(e) => {
            gum::warn!(
                target: LOG_TARGET,
                candidate_hash = ?self.request.candidate_hash,
                origin = ?validator,
                error = ?e,
                "Failed to calculate chunk merkle proof",
            );
            return false;
        },
    };

    let erasure_chunk_hash = BlakeTwo256::hash(&chunk.chunk);
    if anticipated_hash == erasure_chunk_hash {
        return true;
    }

    gum::warn!(target: LOG_TARGET, origin = ?validator, "Received chunk does not match merkle tree");
    false
}
/// Store given chunk and log any error.
async fn store_chunk(&mut self, chunk: ErasureChunk) {
let (tx, rx) = oneshot::channel();
let r = self
.sender
.send(FromFetchTask::Message(
AvailabilityStoreMessage::StoreChunk {
candidate_hash: self.request.candidate_hash,
chunk,
validator_index: self.request.index,
tx,
}
.into(),
))
.await;
if let Err(err) = r {
gum::error!(target: LOG_TARGET, err= ?err, "Storing erasure chunk failed, system shutting down?");
}
if let Err(oneshot::Canceled) = rx.await {
gum::error!(target: LOG_TARGET, "Storing erasure chunk failed");
}
}
/// Report successful completion of the task.
///
/// `bad_validators` are the validators that were asked but did not deliver. If there are none,
/// `Concluded(None)` is sent, otherwise they are reported together with the session and group
/// index. A failure to send the report is only logged.
async fn conclude(&mut self, bad_validators: Vec<AuthorityDiscoveryId>) {
    let report = (!bad_validators.is_empty()).then(|| BadValidators {
        session_index: self.session_index,
        group_index: self.group_index,
        bad_validators,
    });
    let sent = self.sender.send(FromFetchTask::Concluded(report)).await;
    if let Err(err) = sent {
        gum::warn!(
            target: LOG_TARGET,
            err= ?err,
            "Sending concluded message for task failed"
        );
    }
}
/// Report that the chunk could not be fetched for our candidate.
///
/// A failure to send the report is only logged.
async fn conclude_fail(&mut self) {
    let failed = FromFetchTask::Failed(self.request.candidate_hash);
    if let Err(err) = self.sender.send(failed).await {
        gum::warn!(target: LOG_TARGET, ?err, "Sending `Failed` message for task failed");
    }
}
}
@@ -0,0 +1,400 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashMap;
use codec::Encode;
use futures::{
channel::{mpsc, oneshot},
executor, select,
task::{noop_waker, Context, Poll},
Future, FutureExt, StreamExt,
};
use rstest::rstest;
use sc_network::{self as network, ProtocolName};
use sp_keyring::Sr25519Keyring;
use pezkuwi_node_network_protocol::request_response::{
v1::{self, ChunkResponse},
Protocol, Recipient, ReqProtocolNames,
};
use pezkuwi_node_primitives::{BlockData, PoV, Proof};
use pezkuwi_node_subsystem::messages::AllMessages;
use pezkuwi_primitives::{CandidateHash, ChunkIndex, ValidatorIndex};
use super::*;
use crate::{metrics::Metrics, tests::mock::get_valid_chunk_data};
/// With the cancel handle already dropped, the first poll of `run` must return `Ready`.
#[test]
fn task_can_be_canceled() {
    let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
    let (task, _rx) = get_test_running_task(&req_protocol_names, 0.into(), 0.into());

    // Dropping the sending half is how a running task gets canceled.
    let (handle, kill) = oneshot::channel();
    drop(handle);

    let running_task = task.run(kill);
    futures::pin_mut!(running_task);
    let waker = noop_waker();
    let mut ctx = Context::from_waker(&waker);
    let first_poll = running_task.poll(&mut ctx);
    assert!(first_poll == Poll::Ready(()), "Task is immediately finished");
}
/// Make sure task won't accept a chunk that has is invalid.
#[rstest]
#[case(Protocol::ChunkFetchingV1)]
#[case(Protocol::ChunkFetchingV2)]
fn task_does_not_accept_invalid_chunk(#[case] protocol: Protocol) {
let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
let chunk_index = ChunkIndex(1);
let validator_index = ValidatorIndex(0);
let (mut task, rx) = get_test_running_task(&req_protocol_names, validator_index, chunk_index);
let validators = vec![Sr25519Keyring::Alice.public().into()];
task.group = validators;
let protocol_name = req_protocol_names.get_name(protocol);
let test = TestRun {
chunk_responses: {
[(
Recipient::Authority(Sr25519Keyring::Alice.public().into()),
get_response(
protocol,
protocol_name.clone(),
Some((
vec![1, 2, 3],
Proof::try_from(vec![vec![9, 8, 2], vec![2, 3, 4]]).unwrap(),
chunk_index,
)),
),
)]
.into_iter()
.collect()
},
valid_chunks: HashSet::new(),
req_protocol_names,
};
test.run(task, rx);
}
#[rstest]
#[case(Protocol::ChunkFetchingV1)]
#[case(Protocol::ChunkFetchingV2)]
fn task_stores_valid_chunk(#[case] protocol: Protocol) {
let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
// In order for protocol version 1 to work, the chunk index needs to be equal to the validator
// index.
let chunk_index = ChunkIndex(0);
let validator_index =
if protocol == Protocol::ChunkFetchingV1 { ValidatorIndex(0) } else { ValidatorIndex(1) };
let (mut task, rx) = get_test_running_task(&req_protocol_names, validator_index, chunk_index);
let validators = vec![Sr25519Keyring::Alice.public().into()];
let pov = PoV { block_data: BlockData(vec![45, 46, 47]) };
let (root_hash, chunk) = get_valid_chunk_data(pov, 10, chunk_index);
task.erasure_root = root_hash;
task.group = validators;
let protocol_name = req_protocol_names.get_name(protocol);
let test = TestRun {
chunk_responses: {
[(
Recipient::Authority(Sr25519Keyring::Alice.public().into()),
get_response(
protocol,
protocol_name.clone(),
Some((chunk.chunk.clone(), chunk.proof, chunk_index)),
),
)]
.into_iter()
.collect()
},
valid_chunks: [(chunk.chunk)].into_iter().collect(),
req_protocol_names,
};
test.run(task, rx);
}
#[rstest]
#[case(Protocol::ChunkFetchingV1)]
#[case(Protocol::ChunkFetchingV2)]
fn task_does_not_accept_wrongly_indexed_chunk(#[case] protocol: Protocol) {
let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
// In order for protocol version 1 to work, the chunk index needs to be equal to the validator
// index.
let chunk_index = ChunkIndex(0);
let validator_index =
if protocol == Protocol::ChunkFetchingV1 { ValidatorIndex(0) } else { ValidatorIndex(1) };
let (mut task, rx) = get_test_running_task(&req_protocol_names, validator_index, chunk_index);
let validators = vec![Sr25519Keyring::Alice.public().into()];
let pov = PoV { block_data: BlockData(vec![45, 46, 47]) };
let (_, other_chunk) = get_valid_chunk_data(pov.clone(), 10, ChunkIndex(3));
let (root_hash, chunk) = get_valid_chunk_data(pov, 10, ChunkIndex(0));
task.erasure_root = root_hash;
task.request.index = chunk.index.into();
task.group = validators;
let protocol_name = req_protocol_names.get_name(protocol);
let test = TestRun {
chunk_responses: {
[(
Recipient::Authority(Sr25519Keyring::Alice.public().into()),
get_response(
protocol,
protocol_name.clone(),
Some((other_chunk.chunk.clone(), chunk.proof, other_chunk.index)),
),
)]
.into_iter()
.collect()
},
valid_chunks: HashSet::new(),
req_protocol_names,
};
test.run(task, rx);
}
/// Task stores chunk, if there is at least one validator having a valid chunk.
#[rstest]
#[case(Protocol::ChunkFetchingV1)]
#[case(Protocol::ChunkFetchingV2)]
fn task_stores_valid_chunk_if_there_is_one(#[case] protocol: Protocol) {
let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
// In order for protocol version 1 to work, the chunk index needs to be equal to the validator
// index.
let chunk_index = ChunkIndex(1);
let validator_index =
if protocol == Protocol::ChunkFetchingV1 { ValidatorIndex(1) } else { ValidatorIndex(2) };
let (mut task, rx) = get_test_running_task(&req_protocol_names, validator_index, chunk_index);
let pov = PoV { block_data: BlockData(vec![45, 46, 47]) };
let validators = [
// Only Alice has valid chunk - should succeed, even though she is tried last.
Sr25519Keyring::Alice,
Sr25519Keyring::Bob,
Sr25519Keyring::Charlie,
Sr25519Keyring::Dave,
Sr25519Keyring::Eve,
]
.iter()
.map(|v| v.public().into())
.collect::<Vec<_>>();
let (root_hash, chunk) = get_valid_chunk_data(pov, 10, chunk_index);
task.erasure_root = root_hash;
task.group = validators;
let protocol_name = req_protocol_names.get_name(protocol);
let test = TestRun {
chunk_responses: {
[
(
Recipient::Authority(Sr25519Keyring::Alice.public().into()),
get_response(
protocol,
protocol_name.clone(),
Some((chunk.chunk.clone(), chunk.proof, chunk_index)),
),
),
(
Recipient::Authority(Sr25519Keyring::Bob.public().into()),
get_response(protocol, protocol_name.clone(), None),
),
(
Recipient::Authority(Sr25519Keyring::Charlie.public().into()),
get_response(
protocol,
protocol_name.clone(),
Some((
vec![1, 2, 3],
Proof::try_from(vec![vec![9, 8, 2], vec![2, 3, 4]]).unwrap(),
chunk_index,
)),
),
),
]
.into_iter()
.collect()
},
valid_chunks: [(chunk.chunk)].into_iter().collect(),
req_protocol_names,
};
test.run(task, rx);
}
/// Scripted environment for running a `RunningTask` in tests.
struct TestRun {
/// Encoded response (together with the protocol it is sent on) to deliver to a given
/// recipient.
///
/// Recipients without an entry get no reply at all: `handle_message` drops their request
/// together with its response sender.
chunk_responses: HashMap<Recipient, (Vec<u8>, ProtocolName)>,
/// Set of chunks that should be considered valid:
valid_chunks: HashSet<Vec<u8>>,
/// Request protocol names
req_protocol_names: ReqProtocolNames,
}
impl TestRun {
/// Run `task` to completion, answering its messages via `handle_message`.
///
/// Panics if the run ends while the most recently handled message did not allow for it
/// (or if no message was handled at all).
fn run(self, task: RunningTask, rx: mpsc::Receiver<FromFetchTask>) {
sp_tracing::init_for_tests();
let mut rx = rx.fuse();
let task = task.run_inner().fuse();
futures::pin_mut!(task);
executor::block_on(async {
// Whether ending right now would be fine; updated after every handled message.
let mut end_ok = false;
loop {
// Take the next message from the task, or stop once the task or the channel is done.
let msg = select!(
from_task = rx.next() => {
match from_task {
Some(msg) => msg,
None => break,
}
},
() = task =>
break,
);
match msg {
FromFetchTask::Concluded(_) => break,
FromFetchTask::Failed(_) => break,
FromFetchTask::Message(msg) => end_ok = self.handle_message(msg).await,
}
}
if !end_ok {
panic!("Task ended prematurely (failed to store valid chunk)!");
}
});
}
/// Returns true, if after processing of the given message it would be OK for the stream to
/// end.
async fn handle_message(
&self,
msg: overseer::AvailabilityDistributionOutgoingMessages,
) -> bool {
let msg = AllMessages::from(msg);
match msg {
// Chunk requests: answer with the scripted response of the addressed recipient.
AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendRequests(
reqs,
IfDisconnected::ImmediateError,
)) => {
let mut valid_responses = 0;
for req in reqs {
let req = match req {
Requests::ChunkFetching(req) => req,
_ => panic!("Unexpected request"),
};
let response =
self.chunk_responses.get(&req.peer).ok_or(network::RequestFailure::Refused);
// Only scripted recipients get an answer; for all others `req` is simply dropped.
if let Ok((resp, protocol)) = response {
// Decode the scripted response again, to learn whether it carries a valid chunk.
let chunk = if protocol ==
&self.req_protocol_names.get_name(Protocol::ChunkFetchingV1)
{
Into::<Option<v1::ChunkResponse>>::into(
v1::ChunkFetchingResponse::decode(&mut &resp[..]).unwrap(),
)
.map(|c| c.chunk)
} else if protocol ==
&self.req_protocol_names.get_name(Protocol::ChunkFetchingV2)
{
Into::<Option<ErasureChunk>>::into(
v2::ChunkFetchingResponse::decode(&mut &resp[..]).unwrap(),
)
.map(|c| c.chunk)
} else {
unreachable!()
};
if let Some(chunk) = chunk {
if self.valid_chunks.contains(&chunk) {
valid_responses += 1;
}
}
req.pending_response
.send(response.cloned())
.expect("Sending response should succeed");
}
}
// Ending after a request is only fine if no valid chunk was served and none is expected.
return (valid_responses == 0) && self.valid_chunks.is_empty();
},
// Store requests: only chunks registered as valid may ever be stored.
AllMessages::AvailabilityStore(AvailabilityStoreMessage::StoreChunk {
chunk,
tx,
..
}) => {
assert!(self.valid_chunks.contains(&chunk.chunk));
tx.send(Ok(())).expect("Answering fetching task should work");
return true;
},
_ => {
gum::debug!(target: LOG_TARGET, "Unexpected message");
return false;
},
}
}
}
/// Build a `RunningTask` out of (mostly) dummy values, together with the receiving end of its
/// message channel.
///
/// The group is left empty. Only the validator index of the request, the expected chunk index
/// and the protocol names are taken from the arguments.
fn get_test_running_task(
    req_protocol_names: &ReqProtocolNames,
    validator_index: ValidatorIndex,
    chunk_index: ChunkIndex,
) -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
    let (tx, rx) = mpsc::channel(0);
    let request = v2::ChunkFetchingRequest {
        candidate_hash: CandidateHash([43u8; 32].into()),
        index: validator_index,
    };
    let task = RunningTask {
        session_index: 0,
        group_index: GroupIndex(0),
        group: Vec::new(),
        request,
        erasure_root: Hash::repeat_byte(99),
        relay_parent: Hash::repeat_byte(71),
        sender: tx,
        metrics: Metrics::new_dummy(),
        req_v1_protocol_name: req_protocol_names.get_name(Protocol::ChunkFetchingV1),
        req_v2_protocol_name: req_protocol_names.get_name(Protocol::ChunkFetchingV2),
        chunk_index,
    };
    (task, rx)
}
/// Encode a chunk fetching response for the given protocol version.
///
/// `Some` yields a `Chunk` response (the chunk index is not part of a v1 response and gets
/// dropped there), `None` yields `NoSuchChunk`. `protocol_name` is passed through unchanged.
/// Only the two chunk fetching protocols are supported.
fn get_response(
    protocol: Protocol,
    protocol_name: ProtocolName,
    chunk: Option<(Vec<u8>, Proof, ChunkIndex)>,
) -> (Vec<u8>, ProtocolName) {
    let encoded = match protocol {
        Protocol::ChunkFetchingV1 => {
            let response = match chunk {
                Some((chunk, proof, _)) =>
                    v1::ChunkFetchingResponse::Chunk(ChunkResponse { chunk, proof }),
                None => v1::ChunkFetchingResponse::NoSuchChunk,
            };
            response.encode()
        },
        Protocol::ChunkFetchingV2 => {
            let response = match chunk {
                Some((chunk, proof, index)) =>
                    v2::ChunkFetchingResponse::Chunk(ErasureChunk { chunk, index, proof }),
                None => v2::ChunkFetchingResponse::NoSuchChunk,
            };
            response.encode()
        },
        _ => unreachable!(),
    };
    (encoded, protocol_name)
}
@@ -0,0 +1,349 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Requester takes care of requesting erasure chunks for candidates that are pending
//! availability.
use std::{
collections::{hash_map::HashMap, hash_set::HashSet},
iter::IntoIterator,
pin::Pin,
};
use futures::{
channel::{mpsc, oneshot},
task::{Context, Poll},
Stream,
};
use pezkuwi_node_network_protocol::request_response::{v1, v2, IsRequest, ReqProtocolNames};
use pezkuwi_node_subsystem::{
messages::{ChainApiMessage, RuntimeApiMessage},
overseer, ActivatedLeaf, ActiveLeavesUpdate,
};
use pezkuwi_node_subsystem_util::{
availability_chunks::availability_chunk_index,
runtime::{get_occupied_cores, RuntimeInfo},
};
use pezkuwi_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex};
use super::{FatalError, Metrics, Result, LOG_TARGET};
#[cfg(test)]
mod tests;
/// Cache for session information.
mod session_cache;
use session_cache::SessionCache;
/// A task fetching a particular chunk.
mod fetch_task;
use fetch_task::{FetchTask, FetchTaskConfig, FromFetchTask};
/// Requester takes care of requesting erasure chunks from backing groups and stores them in the
/// av store.
///
/// It implements a stream that needs to be advanced in order for it to make progress.
pub struct Requester {
/// Candidates we need to fetch our chunk for.
///
/// We keep those around as long as a candidate is pending availability on some leaf, so we
/// won't fetch chunks multiple times.
///
/// We remove them on failure, so we get retries on the next block still pending availability.
fetches: HashMap<CandidateHash, FetchTask>,
/// Localized information about sessions we are currently interested in.
session_cache: SessionCache,
/// Sender to be cloned for `FetchTask`s.
tx: mpsc::Sender<FromFetchTask>,
/// Receive messages from `FetchTask`.
rx: mpsc::Receiver<FromFetchTask>,
/// Prometheus Metrics
metrics: Metrics,
/// Mapping of the req-response protocols to the full protocol names.
req_protocol_names: ReqProtocolNames,
}
#[overseer::contextbounds(AvailabilityDistribution, prefix = self::overseer)]
impl Requester {
/// How many ancestors of an activated leaf are considered along with the leaf itself.
///
/// Only ancestors belonging to the same session as the leaf are taken into account.
pub(crate) const LEAF_ANCESTRY_LEN_WITHIN_SESSION: usize = 3;
/// Create a new `Requester`.
///
/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
/// by advancing the stream.
pub fn new(req_protocol_names: ReqProtocolNames, metrics: Metrics) -> Self {
let (tx, rx) = mpsc::channel(1);
Requester {
fetches: HashMap::new(),
session_cache: SessionCache::new(),
tx,
rx,
metrics,
req_protocol_names,
}
}
/// Apply an `ActiveLeavesUpdate` to the set of running fetch tasks.
///
/// Chunk fetching is started for a newly activated leaf (if any), afterwards all tasks that
/// were only live in deactivated leaves are dropped.
pub async fn update_fetching_heads<Context>(
	&mut self,
	ctx: &mut Context,
	runtime: &mut RuntimeInfo,
	update: ActiveLeavesUpdate,
) -> Result<()> {
	gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads");

	let ActiveLeavesUpdate { activated: new_leaf, deactivated: obsolete } = update;

	// The order matters: activation has to be processed before deactivation, otherwise jobs
	// that are still needed by the new leaf could get cancelled.
	match new_leaf {
		Some(leaf) => self.start_requesting_chunks(ctx, runtime, leaf).await?,
		None => {},
	}
	self.stop_requesting_chunks(obsolete.into_iter());

	Ok(())
}
/// Start requesting chunks for a newly imported head.
///
/// Besides the head itself, up to [`Self::LEAF_ANCESTRY_LEN_WITHIN_SESSION`] of its ancestors
/// that belong to the same session are looked up, and chunks are requested for the candidates
/// occupying cores at those blocks too.
///
/// Errors from the ancestry lookup, the occupied cores query or from `add_cores` are propagated
/// to the caller.
async fn start_requesting_chunks<Context>(
&mut self,
ctx: &mut Context,
runtime: &mut RuntimeInfo,
new_head: ActivatedLeaf,
) -> Result<()> {
// A cloned sender is used for the queries below, so `ctx` stays free for `add_cores`.
let sender = &mut ctx.sender().clone();
let ActivatedLeaf { hash: leaf, .. } = new_head;
let (leaf_session_index, ancestors_in_session) = get_block_ancestors_in_same_session(
sender,
runtime,
leaf,
Self::LEAF_ANCESTRY_LEN_WITHIN_SESSION,
)
.await?;
// Also spawn or bump tasks for candidates in ancestry in the same session.
for hash in std::iter::once(leaf).chain(ancestors_in_session) {
let cores = get_occupied_cores(sender, hash).await?;
gum::trace!(
target: LOG_TARGET,
occupied_cores = ?cores,
"Query occupied core"
);
// Important:
// We mark the whole ancestry as live in the **leaf** hash, so we don't need to track
// any tasks separately.
//
// The next time the subsystem receives a leaf update, some of the spawned tasks will be
// bumped to be live in the fresh relay parent, while some might get dropped due to the
// current leaf being deactivated.
self.add_cores(ctx, runtime, leaf, leaf_session_index, cores).await?;
}
Ok(())
}
/// Drop every fetch task that is no longer live in any leaf.
///
/// The given leaves are removed from all tasks; tasks that are not live afterwards are
/// removed from `fetches`.
fn stop_requesting_chunks(&mut self, obsolete_leaves: impl Iterator<Item = Hash>) {
	let gone = obsolete_leaves.collect::<HashSet<_>>();

	self.fetches.retain(|_candidate, fetch| {
		fetch.remove_leaves(&gone);
		fetch.is_live()
	});
}
/// Register the candidates occupying the given cores as live in `leaf`.
///
/// For a candidate that already has a fetch task the leaf is merely recorded on that task,
/// for all others a new fetch task is started. No task is started if no session info is
/// available (see `SessionCache::get_session_info`).
///
/// Note: The passed in `leaf` is not the same as `CandidateDescriptor::relay_parent` in the
/// given cores. The latter is the `relay_parent` this candidate considers its parent, while the
/// passed in leaf might be some later block where the candidate is still pending availability.
async fn add_cores<Context>(
	&mut self,
	context: &mut Context,
	runtime: &mut RuntimeInfo,
	leaf: Hash,
	leaf_session_index: SessionIndex,
	cores: impl IntoIterator<Item = (CoreIndex, OccupiedCore)>,
) -> Result<()> {
	for (core_index, core) in cores {
		// Pure book keeping if the chunk of that candidate is already being requested.
		if let Some(running) = self.fetches.get_mut(&core.candidate_hash) {
			running.add_leaf(leaf);
			continue;
		}

		let tx = self.tx.clone();
		let metrics = self.metrics.clone();

		// We use leaf here, the relay_parent must be in the same session as the leaf. This is
		// guaranteed by runtime which ensures that cores are cleared at session boundaries. At
		// the same time, only leaves are guaranteed to be fetchable by the state trie.
		let session_info = self
			.session_cache
			.get_session_info(context, runtime, leaf, leaf_session_index)
			.await
			.map_err(|err| {
				gum::warn!(
					target: LOG_TARGET,
					error = ?err,
					"Failed to spawn a fetch task"
				);
				err
			})?;

		if let Some(session_info) = session_info {
			// Total number of validators over all groups.
			let n_validators = session_info
				.validator_groups
				.iter()
				.map(|group| group.len())
				.fold(0usize, usize::saturating_add);

			let chunk_index = availability_chunk_index(
				session_info.node_features.as_ref(),
				n_validators,
				core_index,
				session_info.our_index,
			)?;

			let v1_protocol = self.req_protocol_names.get_name(v1::ChunkFetchingRequest::PROTOCOL);
			let v2_protocol = self.req_protocol_names.get_name(v2::ChunkFetchingRequest::PROTOCOL);
			let task_cfg = FetchTaskConfig::new(
				leaf,
				&core,
				tx,
				metrics,
				session_info,
				chunk_index,
				v1_protocol,
				v2_protocol,
			);

			let task = FetchTask::start(task_cfg, context).await?;
			self.fetches.insert(core.candidate_hash, task);
		}
	}

	Ok(())
}
}
impl Stream for Requester {
	type Item = overseer::AvailabilityDistributionOutgoingMessages;

	/// Drain messages coming from the fetch tasks.
	///
	/// Outgoing messages are yielded to the caller, everything else is handled internally and
	/// polling goes on until the channel is pending or closed.
	fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context) -> Poll<Option<Self::Item>> {
		loop {
			let from_task = match Pin::new(&mut self.rx).poll_next(ctx) {
				Poll::Pending => return Poll::Pending,
				Poll::Ready(None) => return Poll::Ready(None),
				Poll::Ready(Some(from_task)) => from_task,
			};

			match from_task {
				FromFetchTask::Message(outgoing) => return Poll::Ready(Some(outgoing)),
				FromFetchTask::Concluded(Some(bad_boys)) => {
					self.session_cache.report_bad_log(bad_boys);
				},
				FromFetchTask::Concluded(None) => {},
				FromFetchTask::Failed(candidate_hash) => {
					// Forget the task, so it gets retried on the next block where the
					// candidate is still pending availability.
					self.fetches.remove(&candidate_hash);
				},
			}
		}
	}
}
/// Fetch up to `limit` ancestors of `head` that are in the same session as `head`.
///
/// Returns the session index of `head` together with those ancestors, ordered parent first.
/// If `head` has no ancestors at all, session index `0` and an empty list are returned.
async fn get_block_ancestors_in_same_session<Sender>(
	sender: &mut Sender,
	runtime: &mut RuntimeInfo,
	head: Hash,
	limit: usize,
) -> Result<(SessionIndex, Vec<Hash>)>
where
	Sender:
		overseer::SubsystemSender<RuntimeApiMessage> + overseer::SubsystemSender<ChainApiMessage>,
{
	// Ordered parent, grandparent, ...
	//
	// One block more than `limit` is requested, because the session index of a block is
	// obtained through its parent. The extra block never ends up in the result: the counter
	// below can be bumped at most `ancestors.len() - 1` times.
	let mut ancestors = get_block_ancestors(sender, head, limit + 1).await?;

	// `head` is the child of the first ancestor, which gives us the session index of `head`.
	let head_parent = match ancestors.first() {
		Some(&parent) => parent,
		// No ancestors at all.
		None => return Ok((0, ancestors)),
	};
	let head_session_index = runtime.get_session_index_for_child(sender, head_parent).await?;

	// Count how many leading ancestors share the session of `head`. The i-th ancestor is
	// queried for the session index of its child, i.e. of the (i-1)-th ancestor.
	let mut same_session = 0;
	for &ancestor in ancestors.iter().skip(1) {
		let child_session = runtime.get_session_index_for_child(sender, ancestor).await?;
		if child_session != head_session_index {
			break;
		}
		same_session += 1;
	}

	// Everything beyond that belongs to another session (or is the extra block).
	ancestors.truncate(same_session);

	Ok((head_session_index, ancestors))
}
/// Ask the Chain API for up to `limit` ancestors of `relay_parent`.
///
/// Fails with `FatalError::ChainApiSenderDropped` if the response channel gets dropped and
/// with `FatalError::ChainApi` if the Chain API answers with an error.
async fn get_block_ancestors<Sender>(
	sender: &mut Sender,
	relay_parent: Hash,
	limit: usize,
) -> Result<Vec<Hash>>
where
	Sender: overseer::SubsystemSender<ChainApiMessage>,
{
	let (response_channel, response) = oneshot::channel();
	let request = ChainApiMessage::Ancestors { hash: relay_parent, k: limit, response_channel };
	sender.send_message(request).await;

	let reply = response.await.map_err(FatalError::ChainApiSenderDropped)?;
	let ancestors = reply.map_err(FatalError::ChainApi)?;

	Ok(ancestors)
}
@@ -0,0 +1,221 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashSet;
use rand::{seq::SliceRandom, thread_rng};
use schnellru::{ByLength, LruMap};
use pezkuwi_node_subsystem::overseer;
use pezkuwi_node_subsystem_util::{request_node_features, runtime::RuntimeInfo};
use pezkuwi_primitives::{
AuthorityDiscoveryId, GroupIndex, Hash, NodeFeatures, SessionIndex, ValidatorIndex,
};
use crate::{
error::{Error, Result},
LOG_TARGET,
};
/// Caching of session info as needed by availability chunk distribution.
///
/// It should be ensured that a cached session stays live in the cache as long as we might need it.
pub struct SessionCache {
/// Look up cached sessions by `SessionIndex`.
///
/// Note: Performance of fetching is really secondary here, but we need to ensure we are going
/// to get any existing cache entry, before fetching new information, as we should not mess up
/// the order of validators in `SessionInfo::validator_groups`.
session_info_cache: LruMap<SessionIndex, SessionInfo>,
}
/// Localized session information, tailored for the needs of availability distribution.
#[derive(Clone)]
pub struct SessionInfo {
/// The index of this session.
pub session_index: SessionIndex,
/// Validator groups of this session.
///
/// Each group's order is randomized. This way we achieve load balancing when requesting
/// chunks, as the validators in a group will be tried in that randomized order. Each node
/// should arrive at a different order, therefore we distribute the load on individual
/// validators.
pub validator_groups: Vec<Vec<AuthorityDiscoveryId>>,
/// Our own validator index in this session.
pub our_index: ValidatorIndex,
/// Remember to which group we belong, so we won't start fetching chunks for candidates with
/// our group being responsible. (We should have that chunk already.)
///
/// `None`, if we are not in fact part of any group.
pub our_group: Option<GroupIndex>,
/// Node features, as requested from the runtime when this entry was created.
pub node_features: NodeFeatures,
}
/// Report of bad validators.
///
/// Fetching tasks will report back validators that did not respond as expected, so we can re-order
/// them.
pub struct BadValidators {
/// The session index that was used.
pub session_index: SessionIndex,
/// The group the misbehaving validators belong to.
pub group_index: GroupIndex,
/// The list of bad validators.
pub bad_validators: Vec<AuthorityDiscoveryId>,
}
#[overseer::contextbounds(AvailabilityDistribution, prefix = self::overseer)]
impl SessionCache {
/// Create a new `SessionCache`.
pub fn new() -> Self {
SessionCache {
// We need to cache the current and the last session the most:
session_info_cache: LruMap::new(ByLength::new(2)),
}
}
/// Look up the `SessionInfo` of `session_index`, querying the runtime on a cache miss.
///
/// Returns `Ok(None)` if this node is not a validator in that session; nothing is cached in
/// that case.
pub async fn get_session_info<'a, Context>(
	&'a mut self,
	ctx: &mut Context,
	runtime: &mut RuntimeInfo,
	parent: Hash,
	session_index: SessionIndex,
) -> Result<Option<&'a SessionInfo>> {
	gum::trace!(target: LOG_TARGET, session_index, "Calling `get_session_info`");

	let already_cached = self.session_info_cache.get(&session_index).is_some();
	if !already_cached {
		match Self::query_info_from_runtime(ctx, runtime, parent, session_index).await? {
			Some(info) => {
				gum::trace!(target: LOG_TARGET, session_index, "Storing session info in lru!");
				self.session_info_cache.insert(session_index, info);
			},
			None => return Ok(None),
		}
	}

	Ok(self.session_info_cache.get(&session_index).map(|cached| &*cached))
}
/// Infallible wrapper around `report_bad`: errors are logged and otherwise ignored.
///
/// Failing to report bad validators is not fatal, so it must not bring down the subsystem.
pub fn report_bad_log(&mut self, report: BadValidators) {
	match self.report_bad(report) {
		Ok(()) => {},
		Err(err) => {
			gum::warn!(
				target: LOG_TARGET,
				err = ?err,
				"Reporting bad validators failed with error"
			);
		},
	}
}
/// Make sure we try unresponsive or misbehaving validators last.
///
/// We assume validators in a group are tried in reverse order, so the reported bad validators
/// will be put at the beginning of the group.
pub fn report_bad(&mut self, report: BadValidators) -> Result<()> {
let available_sessions = self.session_info_cache.iter().map(|(k, _)| *k).collect();
let session = self.session_info_cache.get(&report.session_index).ok_or(
Error::NoSuchCachedSession {
available_sessions,
missing_session: report.session_index,
},
)?;
let group = session.validator_groups.get_mut(report.group_index.0 as usize).expect(
"A bad validator report must contain a valid group for the reported session. qed.",
);
let bad_set = report.bad_validators.iter().collect::<HashSet<_>>();
// Get rid of bad boys:
group.retain(|v| !bad_set.contains(v));
// We are trying validators in reverse order, so bad ones should be first:
let mut new_group = report.bad_validators;
new_group.append(group);
*group = new_group;
Ok(())
}
/// Query the needed information from the runtime and build a `SessionInfo` out of it.
///
/// The relay parent is required for the runtime calls. We should not actually need it here:
/// presumably it is used for internal caching based on relay parents, which we don't use. It
/// should not do any harm though.
///
/// Returns: `None` if not a validator.
async fn query_info_from_runtime<Context>(
	ctx: &mut Context,
	runtime: &mut RuntimeInfo,
	relay_parent: Hash,
	session_index: SessionIndex,
) -> Result<Option<SessionInfo>> {
	let info = runtime
		.get_session_info_by_index(ctx.sender(), relay_parent, session_index)
		.await?;
	let node_features = request_node_features(relay_parent, session_index, ctx.sender())
		.await
		.await?
		.map_err(Error::FailedNodeFeatures)?;

	// Nothing to assemble if we are not a validator in that session.
	let our_index = match info.validator_info.our_index {
		Some(index) => index,
		None => return Ok(None),
	};
	let our_group = info.validator_info.our_group;

	let discovery_keys = info.session_info.discovery_keys.clone();
	let mut validator_groups = info.session_info.validator_groups.clone();

	// Shuffle the validators within each group.
	let mut rng = thread_rng();
	for group in validator_groups.iter_mut() {
		group.shuffle(&mut rng);
	}

	// Resolve the validator indices to `AuthorityDiscoveryId`s right away.
	let validator_groups: Vec<Vec<_>> = validator_groups
		.into_iter()
		.map(|group| {
			group
				.into_iter()
				.map(|index| {
					discovery_keys
						.get(index.0 as usize)
						.expect("There should be a discovery key for each validator of each validator group. qed.")
						.clone()
				})
				.collect()
		})
		.collect();

	Ok(Some(SessionInfo { validator_groups, our_index, session_index, our_group, node_features }))
}
}
@@ -0,0 +1,322 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use futures::FutureExt;
use std::future::Future;
use pezkuwi_node_network_protocol::request_response::ReqProtocolNames;
use pezkuwi_node_primitives::{BlockData, ErasureChunk, PoV};
use pezkuwi_node_subsystem_util::runtime::RuntimeInfo;
use pezkuwi_primitives::{
BlockNumber, ChunkIndex, CoreState, ExecutorParams, GroupIndex, Hash, Id as ParaId,
ScheduledCore, SessionIndex, SessionInfo,
};
use sp_core::{testing::TaskExecutor, traits::SpawnNamed};
use pezkuwi_node_subsystem::{
messages::{
AllMessages, AvailabilityDistributionMessage, AvailabilityStoreMessage, ChainApiMessage,
NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest,
},
ActiveLeavesUpdate, SpawnGlue,
};
use pezkuwi_node_subsystem_test_helpers::{
make_subsystem_context,
mock::{make_ferdie_keystore, new_leaf},
TestSubsystemContext, TestSubsystemContextHandle,
};
use crate::tests::{
mock::{get_valid_chunk_data, make_session_info, OccupiedCoreBuilder},
node_features_with_mapping_enabled,
};
use super::Requester;
/// Produce the erasure chunk the mocked availability store answers chunk queries with.
fn get_erasure_chunk() -> ErasureChunk {
	let block_data = BlockData(vec![45, 46, 47]);
	let chunk_data = get_valid_chunk_data(PoV { block_data }, 10, ChunkIndex(0));
	chunk_data.1
}
/// State shared between a test body and the mocked overseer answering its requests.
#[derive(Clone)]
struct TestState {
/// Simulated relay chain heads. For each block except genesis
/// there exists a single corresponding candidate, handled in [`spawn_virtual_overseer`].
pub relay_chain: Vec<Hash>,
/// Session info returned for every `SessionInfo` runtime request.
pub session_info: SessionInfo,
/// Defines a way to compute a session index for the block with
/// a given number. Returns 1 for all blocks by default.
pub session_index_for_block: fn(BlockNumber) -> SessionIndex,
}
impl TestState {
	/// Default test state: a chain of ten blocks, all of them in session 1.
	fn new() -> Self {
		Self {
			relay_chain: (0u8..10).map(Hash::repeat_byte).collect(),
			session_info: make_session_info(),
			session_index_for_block: |_| 1,
		}
	}
}
/// Spawn a task on `pool` that plays the overseer for the subsystem under test.
///
/// The task answers every message received through `ctx_handle` based on `test_state` and
/// terminates once `try_recv` yields `None`. Messages (or runtime requests) that are not
/// expected by the tests make the task panic.
fn spawn_virtual_overseer(
	pool: TaskExecutor,
	test_state: TestState,
	mut ctx_handle: TestSubsystemContextHandle<AvailabilityDistributionMessage>,
) {
	pool.spawn(
		"virtual-overseer",
		None,
		async move {
			// Serve messages until the handle has nothing more to deliver.
			while let Some(msg) = ctx_handle.try_recv().await {
				match msg {
					// Network requests are swallowed, nobody answers them.
					AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendRequests(..)) => {},
					AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryChunk(
						..,
						tx,
					)) => {
						let chunk = get_erasure_chunk();
						tx.send(Some(chunk)).expect("Receiver is expected to be alive");
					},
					AllMessages::AvailabilityStore(AvailabilityStoreMessage::StoreChunk {
						tx,
						..
					}) => {
						// Silently accept it.
						tx.send(Ok(())).expect("Receiver is expected to be alive");
					},
					AllMessages::RuntimeApi(RuntimeApiMessage::Request(hash, req)) => {
						match req {
							RuntimeApiRequest::SessionIndexForChild(tx) => {
								let chain = &test_state.relay_chain;
								// The position in the chain doubles as the block number.
								let block_number = chain
									.iter()
									.position(|h| *h == hash)
									.expect("Invalid session index request");
								// Compute session index of the child, i.e. the next block.
								let session_index_for_block = test_state.session_index_for_block;
								tx.send(Ok(session_index_for_block(block_number as u32 + 1)))
									.expect("Receiver should still be alive");
							},
							RuntimeApiRequest::SessionInfo(_, tx) => {
								tx.send(Ok(Some(test_state.session_info.clone())))
									.expect("Receiver should be alive.");
							},
							RuntimeApiRequest::SessionExecutorParams(_, tx) => {
								tx.send(Ok(Some(ExecutorParams::default())))
									.expect("Receiver should be alive.");
							},
							RuntimeApiRequest::NodeFeatures(_, tx) => {
								tx.send(Ok(node_features_with_mapping_enabled()))
									.expect("Receiver should be alive.");
							},
							RuntimeApiRequest::AvailabilityCores(tx) => {
								let para_id = ParaId::from(1_u32);
								let maybe_block_position =
									test_state.relay_chain.iter().position(|h| *h == hash);
								let cores = match maybe_block_position {
									Some(block_num) => {
										// The first block only has a scheduled core, every
										// other known block has a single occupied core.
										let core = if block_num == 0 {
											CoreState::Scheduled(ScheduledCore {
												para_id,
												collator: None,
											})
										} else {
											CoreState::Occupied(
												OccupiedCoreBuilder {
													group_responsible: GroupIndex(1),
													para_id,
													relay_parent: hash,
													n_validators: 10,
													chunk_index: ChunkIndex(0),
												}
												.build()
												.0,
											)
										};
										vec![core]
									},
									// Unknown blocks have no cores at all.
									None => Vec::new(),
								};
								tx.send(Ok(cores)).expect("Receiver should be alive.")
							},
							_ => {
								panic!("Unexpected runtime request: {:?}", req);
							},
						}
					},
					AllMessages::ChainApi(ChainApiMessage::Ancestors {
						hash,
						k,
						response_channel,
					}) => {
						let chain = &test_state.relay_chain;
						let maybe_block_position = chain.iter().position(|h| *h == hash);
						// Up to `k` blocks preceding `hash`, closest ancestor first.
						let ancestors = maybe_block_position
							.map(|idx| chain[..idx].iter().rev().take(k).copied().collect())
							.unwrap_or_default();
						response_channel
							.send(Ok(ancestors))
							.expect("Receiver is expected to be alive");
					},
					msg => panic!("Unexpected overseer message: {:?}", msg),
				}
			}
		}
		.boxed(),
	);
}
/// Run `test_fx` to completion against a mocked overseer that is driven by `test_state`.
fn test_harness<T: Future<Output = ()>>(
	test_state: TestState,
	test_fx: impl FnOnce(
		TestSubsystemContext<AvailabilityDistributionMessage, SpawnGlue<TaskExecutor>>,
	) -> T,
) {
	let executor = TaskExecutor::new();
	let (subsystem_ctx, overseer_handle) = make_subsystem_context(executor.clone());

	spawn_virtual_overseer(executor, test_state, overseer_handle);

	let test = test_fx(subsystem_ctx);
	futures::executor::block_on(test);
}
/// With all blocks in one session, activating a leaf must also keep the fetch tasks of its
/// ancestors alive, and deactivating leaves must only drop tasks not live anywhere else.
#[test]
fn check_ancestry_lookup_in_same_session() {
let test_state = TestState::new();
let mut requester =
Requester::new(ReqProtocolNames::new(&Hash::repeat_byte(0xff), None), Default::default());
let keystore = make_ferdie_keystore();
let mut runtime = RuntimeInfo::new(Some(keystore));
test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
// Step 1: activate block 1, nothing gets deactivated.
let block_number = 1;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: Vec::new().into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
assert_eq!(fetch_tasks.len(), 1);
// Remember the candidate of the only task, so it can be told apart later on.
let block_1_candidate =
*fetch_tasks.keys().next().expect("A task is checked to be present; qed");
// Step 2: activate block 2 while block 1 stays active.
let block_number = 2;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: Vec::new().into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
assert_eq!(fetch_tasks.len(), 2);
let task = fetch_tasks.get(&block_1_candidate).expect("Leaf hasn't been deactivated yet");
// The task should be live in both blocks 1 and 2.
assert_eq!(task.live_in.len(), 2);
let block_2_candidate = *fetch_tasks
.keys()
.find(|hash| **hash != block_1_candidate)
.expect("Two tasks are present, the first one corresponds to block 1 candidate; qed");
// Step 3: activate a leaf that has block 2 as its oldest considered ancestor.
// Deactivate both blocks but keep the second task as a
// part of ancestry.
let block_number = 2 + Requester::LEAF_ANCESTRY_LEN_WITHIN_SESSION;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: vec![chain[1], chain[2]].into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
// The leaf + K its ancestors.
assert_eq!(fetch_tasks.len(), Requester::LEAF_ANCESTRY_LEN_WITHIN_SESSION + 1);
let block_2_task = fetch_tasks
.get(&block_2_candidate)
.expect("Expected to be live as a part of ancestry");
// Block 2 itself got deactivated, so the task is only live in the new leaf.
assert_eq!(block_2_task.live_in.len(), 1);
});
}
/// Ancestors that belong to a different session than the activated leaf must not be
/// considered when starting fetch tasks.
#[test]
fn check_ancestry_lookup_in_different_sessions() {
let mut test_state = TestState::new();
let mut requester =
Requester::new(ReqProtocolNames::new(&Hash::repeat_byte(0xff), None), Default::default());
let keystore = make_ferdie_keystore();
let mut runtime = RuntimeInfo::new(Some(keystore));
// Blocks 0 to 3 are in session 1, all later blocks are in session 2.
test_state.session_index_for_block = |block_number| match block_number {
0..=3 => 1,
_ => 2,
};
test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
// Step 1: activate block 3, the last block of session 1.
let block_number = 3;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: Vec::new().into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
// At most three tasks are expected here, fewer if the ancestry length gets reduced.
assert_eq!(fetch_tasks.len(), 3.min(Requester::LEAF_ANCESTRY_LEN_WITHIN_SESSION + 1));
// Step 2: activate block 4, the first block of session 2, and drop everything older.
let block_number = 4;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: vec![chain[1], chain[2], chain[3]].into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
// Only a single task is expected to remain after the session change.
assert_eq!(fetch_tasks.len(), 1);
// Step 3: activate block 5, which shares session 2 with block 4.
let block_number = 5;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
deactivated: vec![chain[4]].into(),
};
requester
.update_fetching_heads(&mut ctx, &mut runtime, update)
.await
.expect("Leaf processing failed");
let fetch_tasks = &requester.fetches;
// At most two tasks are expected here, fewer if the ancestry length gets reduced.
assert_eq!(fetch_tasks.len(), 2.min(Requester::LEAF_ANCESTRY_LEN_WITHIN_SESSION + 1));
});
}
@@ -0,0 +1,296 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Answer requests for availability chunks.
use std::sync::Arc;
use futures::{channel::oneshot, select, FutureExt};
use codec::{Decode, Encode};
use fatality::Nested;
use pezkuwi_node_network_protocol::{
request_response::{v1, v2, IncomingRequest, IncomingRequestReceiver, IsRequest},
UnifiedReputationChange as Rep,
};
use pezkuwi_node_primitives::{AvailableData, ErasureChunk};
use pezkuwi_node_subsystem::{messages::AvailabilityStoreMessage, SubsystemSender};
use pezkuwi_primitives::{CandidateHash, ValidatorIndex};
use crate::{
error::{JfyiError, Result},
metrics::{Metrics, FAILED, NOT_FOUND, SUCCEEDED},
LOG_TARGET,
};
/// Reputation change handed to the request receivers via `recv`.
// NOTE(review): presumably applied to peers whose request fails to decode - confirm against
// `IncomingRequestReceiver::recv`.
const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Received message could not be decoded.");
/// Receiver task to be forked as a separate task to handle PoV requests.
///
/// Runs until receiving fails with a fatal error. Non-fatal receive errors are only logged.
pub async fn run_pov_receiver<Sender>(
	mut sender: Sender,
	mut receiver: IncomingRequestReceiver<v1::PoVFetchingRequest>,
	metrics: Metrics,
) where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	loop {
		let incoming = receiver.recv(|| vec![COST_INVALID_REQUEST]).await.into_nested();

		let request = match incoming {
			Err(fatal) => {
				gum::debug!(
					target: LOG_TARGET,
					error = ?fatal,
					"Shutting down POV receiver."
				);
				return;
			},
			Ok(Err(jfyi)) => {
				gum::debug!(target: LOG_TARGET, error = ?jfyi, "Error decoding incoming PoV request.");
				continue;
			},
			Ok(Ok(request)) => request,
		};

		answer_pov_request_log(&mut sender, request, &metrics).await;
	}
}
/// Receiver task to be forked as a separate task to handle chunk requests.
///
/// Serves both the v1 and the v2 chunk fetching protocol. Runs until one of the receivers
/// fails with a fatal error. Non-fatal receive errors are only logged.
pub async fn run_chunk_receivers<Sender>(
	mut sender: Sender,
	mut receiver_v1: IncomingRequestReceiver<v1::ChunkFetchingRequest>,
	mut receiver_v2: IncomingRequestReceiver<v2::ChunkFetchingRequest>,
	metrics: Metrics,
) where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	// The protocol versions only differ in their responses.
	let make_resp_v1 = |chunk: Option<ErasureChunk>| match chunk {
		Some(found) => v1::ChunkFetchingResponse::Chunk(found.into()),
		None => v1::ChunkFetchingResponse::NoSuchChunk,
	};
	let make_resp_v2 = |chunk: Option<ErasureChunk>| match chunk {
		Some(found) => v2::ChunkFetchingResponse::Chunk(found.into()),
		None => v2::ChunkFetchingResponse::NoSuchChunk,
	};

	loop {
		select! {
			incoming = receiver_v1.recv(|| vec![COST_INVALID_REQUEST]).fuse() => match incoming.into_nested() {
				Err(fatal) => {
					gum::debug!(
						target: LOG_TARGET,
						error = ?fatal,
						"Shutting down chunk receiver."
					);
					return
				},
				Ok(Err(jfyi)) => {
					gum::debug!(
						target: LOG_TARGET,
						error = ?jfyi,
						"Error decoding incoming chunk request."
					);
				},
				Ok(Ok(request)) => {
					answer_chunk_request_log(&mut sender, request, make_resp_v1, &metrics).await;
				},
			},
			incoming = receiver_v2.recv(|| vec![COST_INVALID_REQUEST]).fuse() => match incoming.into_nested() {
				Err(fatal) => {
					gum::debug!(
						target: LOG_TARGET,
						error = ?fatal,
						"Shutting down chunk receiver."
					);
					return
				},
				Ok(Err(jfyi)) => {
					gum::debug!(
						target: LOG_TARGET,
						error = ?jfyi,
						"Error decoding incoming chunk request."
					);
				},
				Ok(Ok(request)) => {
					answer_chunk_request_log(&mut sender, request.into(), make_resp_v2, &metrics).await;
				},
			}
		}
	}
}
/// Wrapper around `answer_pov_request` that takes care of Prometheus metrics and logging.
///
/// Errors of `answer_pov_request` are logged and counted, but never propagated.
pub async fn answer_pov_request_log<Sender>(
	sender: &mut Sender,
	req: IncomingRequest<v1::PoVFetchingRequest>,
	metrics: &Metrics,
) where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	match answer_pov_request(sender, req).await {
		Ok(true) => metrics.on_served_pov(SUCCEEDED),
		Ok(false) => metrics.on_served_pov(NOT_FOUND),
		Err(err) => {
			gum::warn!(
				target: LOG_TARGET,
				err= ?err,
				"Serving PoV failed with error"
			);
			metrics.on_served_pov(FAILED);
		},
	}
}
/// Wrapper around `answer_chunk_request` that takes care of Prometheus metrics and logging.
///
/// Errors of `answer_chunk_request` are logged and counted, but never propagated.
pub async fn answer_chunk_request_log<Sender, Req, MakeResp>(
	sender: &mut Sender,
	req: IncomingRequest<Req>,
	make_response: MakeResp,
	metrics: &Metrics,
) where
	Req: IsRequest + Decode + Encode + Into<v1::ChunkFetchingRequest>,
	Req::Response: Encode,
	Sender: SubsystemSender<AvailabilityStoreMessage>,
	MakeResp: Fn(Option<ErasureChunk>) -> Req::Response,
{
	match answer_chunk_request(sender, req, make_response).await {
		Ok(true) => metrics.on_served_chunk(SUCCEEDED),
		Ok(false) => metrics.on_served_chunk(NOT_FOUND),
		Err(err) => {
			gum::warn!(
				target: LOG_TARGET,
				err= ?err,
				"Serving chunk failed with error"
			);
			metrics.on_served_chunk(FAILED);
		},
	}
}
/// Answer an incoming PoV fetch request by querying the av store.
///
/// Returns: `Ok(true)` if the PoV was found and served, `Ok(false)` if the requester was told
/// that there is no such PoV.
pub async fn answer_pov_request<Sender>(
	sender: &mut Sender,
	req: IncomingRequest<v1::PoVFetchingRequest>,
) -> Result<bool>
where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	let (response, found) = match query_available_data(sender, req.payload.candidate_hash).await? {
		Some(av_data) => {
			// Only clone the PoV if somebody else still holds on to it.
			let pov = Arc::try_unwrap(av_data.pov).unwrap_or_else(|shared| (*shared).clone());
			(v1::PoVFetchingResponse::PoV(pov), true)
		},
		None => (v1::PoVFetchingResponse::NoSuchPoV, false),
	};

	req.send_response(response).map_err(|_| JfyiError::SendResponse)?;

	Ok(found)
}
/// Answer an incoming chunk request by querying the av store.
///
/// Returns: `Ok(true)` if chunk was found and served.
pub async fn answer_chunk_request<Sender, Req, MakeResp>(
	sender: &mut Sender,
	req: IncomingRequest<Req>,
	make_response: MakeResp,
) -> Result<bool>
where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
	Req: IsRequest + Decode + Encode + Into<v1::ChunkFetchingRequest>,
	Req::Response: Encode,
	MakeResp: Fn(Option<ErasureChunk>) -> Req::Response,
{
	// The payload of V1 and V2 requests is the same, so either one can be turned into the V1
	// request. Only the responses differ, which is what the `MakeResp` generic is for.
	let payload: v1::ChunkFetchingRequest = req.payload.into();

	let maybe_chunk = query_chunk(sender, payload.candidate_hash, payload.index).await?;
	let found = maybe_chunk.is_some();

	gum::trace!(
		target: LOG_TARGET,
		hash = ?payload.candidate_hash,
		index = ?payload.index,
		peer = ?req.peer,
		has_data = ?found,
		"Serving chunk",
	);

	req.pending_response
		.send_response(make_response(maybe_chunk))
		.map_err(|_| JfyiError::SendResponse)?;

	Ok(found)
}
/// Query chunk from the availability store.
///
/// Fails with `JfyiError::QueryChunkResponseChannel` if the response channel gets dropped.
async fn query_chunk<Sender>(
	sender: &mut Sender,
	candidate_hash: CandidateHash,
	validator_index: ValidatorIndex,
) -> std::result::Result<Option<ErasureChunk>, JfyiError>
where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	let (response_tx, response_rx) = oneshot::channel();
	let query = AvailabilityStoreMessage::QueryChunk(candidate_hash, validator_index, response_tx);
	sender.send_message(query.into()).await;

	response_rx.await.map_err(|e| {
		gum::trace!(
			target: LOG_TARGET,
			?validator_index,
			?candidate_hash,
			error = ?e,
			"Error retrieving chunk",
		);
		JfyiError::QueryChunkResponseChannel(e)
	})
}
/// Ask the availability store for the full `AvailableData` (which contains the PoV) of a
/// candidate.
///
/// Returns `Ok(None)` if the store answered without data. A dropped response channel is
/// reported as `JfyiError::QueryAvailableDataResponseChannel`.
async fn query_available_data<Sender>(
	sender: &mut Sender,
	candidate_hash: CandidateHash,
) -> Result<Option<AvailableData>>
where
	Sender: SubsystemSender<AvailabilityStoreMessage>,
{
	let (response_tx, response_rx) = oneshot::channel();
	sender
		.send_message(
			AvailabilityStoreMessage::QueryAvailableData(candidate_hash, response_tx).into(),
		)
		.await;

	let maybe_data = response_rx
		.await
		.map_err(JfyiError::QueryAvailableDataResponseChannel)?;
	Ok(maybe_data)
}
@@ -0,0 +1,166 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Helper functions and tools to generate mock data useful for testing this subsystem.
use std::sync::Arc;
use sp_keyring::Sr25519Keyring;
use pezkuwi_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks};
use pezkuwi_node_primitives::{AvailableData, BlockData, ErasureChunk, PoV, Proof};
use pezkuwi_primitives::{
CandidateCommitments, CandidateHash, ChunkIndex, CommittedCandidateReceiptV2, GroupIndex, Hash,
HeadData, Id as ParaId, IndexedVec, OccupiedCore, PersistedValidationData, SessionInfo,
ValidatorIndex,
};
use pezkuwi_primitives_test_helpers::{
dummy_collator, dummy_collator_signature, dummy_hash, dummy_validation_code,
CandidateDescriptor, CommittedCandidateReceipt,
};
/// Create dummy session info with two validator groups.
pub fn make_session_info() -> SessionInfo {
let validators = vec![
Sr25519Keyring::Ferdie, // <- this node, role: validator
Sr25519Keyring::Alice,
Sr25519Keyring::Bob,
Sr25519Keyring::Charlie,
Sr25519Keyring::Dave,
Sr25519Keyring::Eve,
Sr25519Keyring::One,
];
let validator_groups: IndexedVec<GroupIndex, Vec<ValidatorIndex>> =
[vec![5, 0, 3], vec![1, 6, 2, 4]]
.iter()
.map(|g| g.into_iter().map(|v| ValidatorIndex(*v)).collect())
.collect();
SessionInfo {
discovery_keys: validators.iter().map(|k| k.public().into()).collect(),
// Not used:
n_cores: validator_groups.len() as u32,
validator_groups,
// Not used values:
validators: validators.iter().map(|k| k.public().into()).collect(),
assignment_keys: Vec::new(),
zeroth_delay_tranche_width: 0,
relay_vrf_modulo_samples: 0,
n_delay_tranches: 0,
no_show_slots: 0,
needed_approvals: 0,
active_validator_indices: Vec::new(),
dispute_period: 6,
random_seed: [0u8; 32],
}
}
/// Builder for constructing occupied cores.
///
/// Takes all the values we care about and fills the rest with dummy values on `build`.
pub struct OccupiedCoreBuilder {
/// Backing group that is recorded as responsible for the built core.
pub group_responsible: GroupIndex,
/// Para id written into the candidate descriptor.
pub para_id: ParaId,
/// Relay parent written into the candidate descriptor.
pub relay_parent: Hash,
/// Number of validators the dummy available data is erasure coded for.
pub n_validators: usize,
/// Index of the erasure chunk that `build` returns alongside the core.
pub chunk_index: ChunkIndex,
}
impl OccupiedCoreBuilder {
	/// Build an occupied core for a candidate with a fixed dummy PoV.
	///
	/// Returns the core together with the candidate hash and the erasure chunk at
	/// `self.chunk_index` of the candidate's available data. The descriptor's `pov_hash` and
	/// `erasure_root` match the dummy PoV; all other values not provided by the builder are
	/// dummies.
	pub fn build(self) -> (OccupiedCore, (CandidateHash, ErasureChunk)) {
		let pov = PoV { block_data: BlockData(vec![45, 46, 47]) };
		let pov_hash = pov.hash();
		// The PoV is not needed after this call, so it is handed over by value.
		let (erasure_root, chunk) =
			get_valid_chunk_data(pov, self.n_validators, self.chunk_index);

		let candidate_receipt = TestCandidateBuilder {
			para_id: self.para_id,
			pov_hash,
			relay_parent: self.relay_parent,
			erasure_root,
			..Default::default()
		}
		.build();

		// Hash the receipt once and reuse the result for both the core and the return value.
		let candidate_hash = candidate_receipt.hash();

		let core = OccupiedCore {
			next_up_on_available: None,
			occupied_since: 0,
			time_out_at: 0,
			next_up_on_time_out: None,
			availability: Default::default(),
			group_responsible: self.group_responsible,
			candidate_hash,
			// The receipt is owned and no longer needed, so the descriptor can be moved out.
			candidate_descriptor: candidate_receipt.descriptor,
		};
		(core, (candidate_hash, chunk))
	}
}
/// Builder for dummy candidate receipts.
///
/// Only the fields below are configurable; `build` fills every other part of the receipt with
/// dummy or default values.
#[derive(Default)]
pub struct TestCandidateBuilder {
// Para id written into the candidate descriptor.
para_id: ParaId,
// Head data written into the candidate commitments.
head_data: HeadData,
// PoV hash written into the candidate descriptor.
pov_hash: Hash,
// Relay parent written into the candidate descriptor.
relay_parent: Hash,
// Erasure root written into the candidate descriptor.
erasure_root: Hash,
}
impl TestCandidateBuilder {
	/// Assemble a committed candidate receipt from the configured values and convert it into a
	/// `CommittedCandidateReceiptV2`.
	///
	/// Collator, signature, para head, validation code hash and the persisted validation data
	/// hash are dummies; all commitments except `head_data` are defaults.
	pub fn build(self) -> CommittedCandidateReceiptV2 {
		let Self { para_id, head_data, pov_hash, relay_parent, erasure_root } = self;

		let descriptor = CandidateDescriptor {
			para_id,
			pov_hash,
			relay_parent,
			erasure_root,
			collator: dummy_collator(),
			persisted_validation_data_hash: dummy_hash(),
			signature: dummy_collator_signature(),
			para_head: dummy_hash(),
			validation_code_hash: dummy_validation_code().hash(),
		};
		let commitments = CandidateCommitments { head_data, ..Default::default() };

		let receipt = CommittedCandidateReceipt { descriptor, commitments };
		receipt.into()
	}
}
/// Erasure code dummy available data for `pov` and return the chunk at `chunk_index`.
///
/// The available data is split into `n_validators` chunks. Returns the erasure root together
/// with the requested chunk, including its proof.
///
/// Panics if erasure coding fails, if there are fewer than `chunk_index + 1` chunks or if the
/// proof of the requested chunk cannot be converted.
pub fn get_valid_chunk_data(
	pov: PoV,
	n_validators: usize,
	chunk_index: ChunkIndex,
) -> (Hash, ErasureChunk) {
	let persisted = PersistedValidationData {
		parent_head: HeadData(vec![7, 8, 9]),
		relay_parent_number: Default::default(),
		max_pov_size: 1024,
		relay_parent_storage_root: Default::default(),
	};
	let available_data = AvailableData { validation_data: persisted, pov: Arc::new(pov) };
	let chunks = obtain_chunks(n_validators, &available_data).unwrap();
	let mut chunk_branches = branches(chunks.as_ref());
	let root = chunk_branches.root();

	// Jump straight to the requested branch instead of materializing every preceding chunk
	// (and converting its proof) only to throw it away.
	let (proof, chunk) = chunk_branches
		.nth(chunk_index.0 as usize)
		.expect("There really should be enough chunks.");

	let chunk = ErasureChunk {
		chunk: chunk.to_vec(),
		// The branch position is exactly the requested chunk index.
		index: chunk_index,
		proof: Proof::try_from(proof).unwrap(),
	};
	(root, chunk)
}
@@ -0,0 +1,201 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashSet;
use futures::{executor, future, Future};
use rstest::rstest;
use pezkuwi_node_network_protocol::request_response::{
IncomingRequest, Protocol, ReqProtocolNames,
};
use pezkuwi_primitives::{node_features, Block, CoreState, Hash, NodeFeatures};
use sp_keystore::KeystorePtr;
use super::*;
mod state;
/// State for test harnesses.
use state::{TestHarness, TestState};
/// Mock data useful for testing.
pub(crate) mod mock;
/// Run the availability distribution subsystem side by side with the test future `test_fx`.
///
/// Sets up a subsystem context, the incoming request receivers and the subsystem itself, hands
/// a `TestHarness` to `test_fx` and blocks until both the test future and the subsystem have
/// finished. Returns the result the subsystem finished with.
fn test_harness<T: Future<Output = ()>>(
	keystore: KeystorePtr,
	req_protocol_names: ReqProtocolNames,
	test_fx: impl FnOnce(TestHarness) -> T,
) -> std::result::Result<(), FatalError> {
	sp_tracing::init_for_tests();

	let pool = sp_core::testing::TaskExecutor::new();
	let (context, virtual_overseer) =
		pezkuwi_node_subsystem_test_helpers::make_subsystem_context(pool.clone());

	// The PoV request config is not used by the tests, but it is kept bound to a name so it
	// lives until the end of this function.
	let (pov_req_receiver, _pov_req_cfg) = IncomingRequest::get_config_receiver::<
		Block,
		sc_network::NetworkWorker<Block, Hash>,
	>(&req_protocol_names);
	let (chunk_req_v1_receiver, chunk_req_v1_cfg) = IncomingRequest::get_config_receiver::<
		Block,
		sc_network::NetworkWorker<Block, Hash>,
	>(&req_protocol_names);
	let (chunk_req_v2_receiver, chunk_req_v2_cfg) = IncomingRequest::get_config_receiver::<
		Block,
		sc_network::NetworkWorker<Block, Hash>,
	>(&req_protocol_names);

	let receivers =
		IncomingRequestReceivers { pov_req_receiver, chunk_req_v1_receiver, chunk_req_v2_receiver };
	let subsystem_fut =
		AvailabilityDistributionSubsystem::new(keystore, receivers, req_protocol_names, Default::default())
			.run(context);

	let harness = TestHarness { virtual_overseer, chunk_req_v1_cfg, chunk_req_v2_cfg, pool };
	let test_fut = test_fx(harness);

	futures::pin_mut!(test_fut);
	futures::pin_mut!(subsystem_fut);

	let (_, subsystem_result) = executor::block_on(future::join(test_fut, subsystem_fut));
	subsystem_result
}
/// Node features with only the `AvailabilityChunkMapping` bit set.
///
/// The bit vector is sized to end exactly at that feature bit.
pub fn node_features_with_mapping_enabled() -> NodeFeatures {
	let mapping_bit = node_features::FeatureIndex::AvailabilityChunkMapping as usize;

	let mut features = NodeFeatures::new();
	features.resize(mapping_bit + 1, false);
	features.set(mapping_bit, true);
	features
}
/// Basic smoke test: runs the subsystem against the unmodified default `TestState`.
///
/// Exceptional cases are tested as unit tests in `fetch_task`.
#[rstest]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV1)]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV2)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV1)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV2)]
fn check_basic(#[case] node_features: NodeFeatures, #[case] chunk_resp_protocol: Protocol) {
	let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
	let state =
		TestState::new(node_features.clone(), req_protocol_names.clone(), chunk_resp_protocol);
	let keystore = state.keystore.clone();

	// With chunk mapping enabled and V1 responses, chunk fetching is not possible: the
	// ValidatorIndex is not equal to the ChunkIndex and the peer does not send back the actual
	// ChunkIndex.
	let fetching_impossible = chunk_resp_protocol == Protocol::ChunkFetchingV1 &&
		node_features == node_features_with_mapping_enabled();

	if !fetching_impossible {
		test_harness(keystore, req_protocol_names, move |harness| state.run(harness)).unwrap();
		return
	}

	let _ = test_harness(keystore, req_protocol_names, move |harness| {
		state.run_assert_timeout(harness)
	});
}
/// Check that the requester tries all validators of a group.
///
/// Every prepared chunk gets three failing responses queued in front of it, so the valid chunk
/// is only served on the fourth query.
#[rstest]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV1)]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV2)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV1)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV2)]
fn check_fetch_tries_all(
	#[case] node_features: NodeFeatures,
	#[case] chunk_resp_protocol: Protocol,
) {
	let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
	let mut state =
		TestState::new(node_features.clone(), req_protocol_names.clone(), chunk_resp_protocol);

	// 4 validators in group, so three misses should still let the fetch succeed:
	for responses in state.chunks.values_mut() {
		for _ in 0..3 {
			responses.push(None);
		}
	}

	let keystore = state.keystore.clone();

	// With chunk mapping enabled and V1 responses, chunk fetching is not possible: the
	// ValidatorIndex is not equal to the ChunkIndex and the peer does not send back the actual
	// ChunkIndex.
	let fetching_impossible = chunk_resp_protocol == Protocol::ChunkFetchingV1 &&
		node_features == node_features_with_mapping_enabled();

	if !fetching_impossible {
		test_harness(keystore, req_protocol_names, move |harness| state.run(harness)).unwrap();
		return
	}

	let _ = test_harness(keystore, req_protocol_names, move |harness| {
		state.run_assert_timeout(harness)
	});
}
/// Check that the requester retries a failed fetch on the next block.
///
/// The candidate stays pending availability on the following block, so a fetch that failed for
/// every validator of the group is expected to be attempted again there.
#[rstest]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV1)]
#[case(NodeFeatures::EMPTY, Protocol::ChunkFetchingV2)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV1)]
#[case(node_features_with_mapping_enabled(), Protocol::ChunkFetchingV2)]
fn check_fetch_retry(#[case] node_features: NodeFeatures, #[case] chunk_resp_protocol: Protocol) {
	let req_protocol_names = ReqProtocolNames::new(&Hash::repeat_byte(0xff), None);
	let mut state =
		TestState::new(node_features.clone(), req_protocol_names.clone(), chunk_resp_protocol);

	// Let the third block report the very same occupied cores as the second one.
	let still_occupied = state.cores.get(&state.relay_chain[1]).unwrap().clone();
	state.cores.insert(state.relay_chain[2], still_occupied);

	// We only care about the first three blocks.
	// 1. scheduled
	// 2. occupied
	// 3. still occupied
	state.relay_chain.truncate(3);

	// Get rid of unused valid chunks:
	let valid_candidate_hashes: HashSet<_> = state
		.cores
		.get(&state.relay_chain[1])
		.into_iter()
		.flatten()
		.filter_map(|core_state| match core_state {
			CoreState::Occupied(core) => Some(core.candidate_hash),
			_ => None,
		})
		.collect();
	state
		.valid_chunks
		.retain(|(candidate, _)| valid_candidate_hashes.contains(candidate));

	// This should still succeed as cores are still pending availability on next block.
	for responses in state.chunks.values_mut() {
		for _ in 0..5 {
			responses.push(None);
		}
	}

	let keystore = state.keystore.clone();

	// With chunk mapping enabled and V1 responses, chunk fetching is not possible: the
	// ValidatorIndex is not equal to the ChunkIndex and the peer does not send back the actual
	// ChunkIndex.
	let fetching_impossible = chunk_resp_protocol == Protocol::ChunkFetchingV1 &&
		node_features == node_features_with_mapping_enabled();

	if !fetching_impossible {
		test_harness(keystore, req_protocol_names, move |harness| state.run(harness)).unwrap();
		return
	}

	let _ = test_harness(keystore, req_protocol_names, move |harness| {
		state.run_assert_timeout(harness)
	});
}
@@ -0,0 +1,450 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
collections::{HashMap, HashSet},
time::Duration,
};
use network::{request_responses::OutgoingResponse, ProtocolName, RequestFailure};
use pezkuwi_node_subsystem_test_helpers::TestSubsystemContextHandle;
use pezkuwi_node_subsystem_util::{availability_chunks::availability_chunk_index, TimeoutExt};
use futures::{
channel::{mpsc, oneshot},
FutureExt, SinkExt, StreamExt,
};
use futures_timer::Delay;
use sc_network as network;
use sc_network::{config as netconfig, config::RequestResponseConfig, IfDisconnected};
use sp_core::{testing::TaskExecutor, traits::SpawnNamed};
use sp_keystore::KeystorePtr;
use pezkuwi_node_network_protocol::request_response::{
v1, v2, IncomingRequest, OutgoingRequest, Protocol, ReqProtocolNames, Requests,
};
use pezkuwi_node_primitives::ErasureChunk;
use pezkuwi_node_subsystem::{
messages::{
AllMessages, AvailabilityDistributionMessage, AvailabilityStoreMessage, ChainApiMessage,
NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest,
},
ActiveLeavesUpdate, FromOrchestra, OverseerSignal,
};
use pezkuwi_node_subsystem_test_helpers as test_helpers;
use pezkuwi_primitives::{
CandidateHash, ChunkIndex, CoreIndex, CoreState, ExecutorParams, GroupIndex, Hash,
Id as ParaId, NodeFeatures, ScheduledCore, SessionInfo, ValidatorIndex,
};
use test_helpers::mock::{make_ferdie_keystore, new_leaf};
use super::mock::{make_session_info, OccupiedCoreBuilder};
use crate::LOG_TARGET;
type VirtualOverseer = pezkuwi_node_subsystem_test_helpers::TestSubsystemContextHandle<
AvailabilityDistributionMessage,
>;
/// Everything a test needs to drive the subsystem under test.
pub struct TestHarness {
/// Handle used to send signals to the subsystem and to receive its outgoing messages.
pub virtual_overseer: VirtualOverseer,
/// Request-response config of the v1 chunk fetching protocol; its inbound queue is used to
/// inject incoming v1 requests.
pub chunk_req_v1_cfg: RequestResponseConfig,
/// Request-response config of the v2 chunk fetching protocol; its inbound queue is used to
/// inject incoming v2 requests.
pub chunk_req_v2_cfg: RequestResponseConfig,
/// Executor used for spawning helper tasks of the test.
pub pool: TaskExecutor,
}
/// `TestState` for mocking execution of this subsystem.
///
/// The instance created by `TestState::new` provides data, which makes the system succeed by
/// providing a couple of valid occupied cores. You can tune the data before calling
/// `TestState::run`. E.g. modify some chunks to be invalid, the test will then still pass if you
/// remove that chunk from `valid_chunks`.
#[derive(Clone)]
pub struct TestState {
/// Simulated relay chain heads:
pub relay_chain: Vec<Hash>,
/// Whenever the subsystem tries to fetch an erasure chunk one item of the given vec will be
/// popped. So you can experiment with serving invalid chunks or no chunks on request and see
/// whether the subsystem still succeeds with its goal.
pub chunks: HashMap<(CandidateHash, ValidatorIndex), Vec<Option<ErasureChunk>>>,
/// All chunks that are valid and should be accepted.
pub valid_chunks: HashSet<(CandidateHash, ValidatorIndex)>,
/// Session info handed out for runtime API `SessionInfo` requests.
pub session_info: SessionInfo,
/// Cores per relay chain block.
pub cores: HashMap<Hash, Vec<CoreState>>,
/// Keystore the subsystem under test is started with.
pub keystore: KeystorePtr,
/// Node features handed out for runtime API `NodeFeatures` requests.
pub node_features: NodeFeatures,
/// Chunk fetching protocol version the simulated peers receive requests on.
pub chunk_response_protocol: Protocol,
/// Protocol names used for naming the protocol of forwarded responses.
pub req_protocol_names: ReqProtocolNames,
/// Chunk index every stored chunk is expected to carry.
pub our_chunk_index: ChunkIndex,
}
impl TestState {
/// Initialize a default test state.
pub fn new(
node_features: NodeFeatures,
req_protocol_names: ReqProtocolNames,
chunk_response_protocol: Protocol,
) -> Self {
let relay_chain: Vec<_> = (1u8..10).map(Hash::repeat_byte).collect();
let chain_a = ParaId::from(1);
let chain_b = ParaId::from(2);
let chain_ids = vec![chain_a, chain_b];
let keystore = make_ferdie_keystore();
let session_info = make_session_info();
let our_chunk_index = availability_chunk_index(
&node_features,
session_info.validators.len(),
CoreIndex(1),
ValidatorIndex(0),
)
.unwrap();
let (cores, chunks) = {
let mut cores = HashMap::new();
let mut chunks = HashMap::new();
cores.insert(
relay_chain[0],
vec![
CoreState::Scheduled(ScheduledCore { para_id: chain_ids[0], collator: None }),
CoreState::Scheduled(ScheduledCore { para_id: chain_ids[1], collator: None }),
],
);
let heads = {
let mut advanced = relay_chain.iter();
advanced.next();
relay_chain.iter().zip(advanced)
};
for (relay_parent, relay_child) in heads {
let (p_cores, p_chunks): (Vec<_>, Vec<_>) = chain_ids
.iter()
.enumerate()
.map(|(i, para_id)| {
let (core, chunk) = OccupiedCoreBuilder {
group_responsible: GroupIndex(i as _),
para_id: *para_id,
relay_parent: *relay_parent,
n_validators: session_info.validators.len(),
chunk_index: our_chunk_index,
}
.build();
(CoreState::Occupied(core), chunk)
})
.unzip();
cores.insert(*relay_child, p_cores);
// Skip chunks for our own group (won't get fetched):
let mut chunks_other_groups = p_chunks.into_iter();
chunks_other_groups.next();
for (candidate, chunk) in chunks_other_groups {
chunks.insert((candidate, ValidatorIndex(0)), vec![Some(chunk)]);
}
}
(cores, chunks)
};
Self {
relay_chain,
valid_chunks: chunks.clone().keys().map(Clone::clone).collect(),
chunks,
session_info,
cores,
keystore,
node_features,
chunk_response_protocol,
req_protocol_names,
our_chunk_index,
}
}
/// Run the test scenario and fail if it does not finish within five seconds.
pub async fn run(self, harness: TestHarness) {
	// The timeout makes sure the test won't run forever.
	let outcome = self.run_inner(harness).timeout(Duration::from_secs(5)).await;
	assert!(outcome.is_some(), "Test ran into timeout");
}
/// Run the test scenario and fail if it does finish within five seconds.
///
/// To be used for scenarios in which the subsystem is expected to never reach its goal.
pub async fn run_assert_timeout(self, harness: TestHarness) {
	// The timeout makes sure the test won't run forever.
	let outcome = self.run_inner(harness).timeout(Duration::from_secs(5)).await;
	assert!(outcome.is_none(), "Test should have run into timeout");
}
/// Run tests with the given mock values in `TestState`.
///
/// This will simply advance through the simulated chain and examines whether the subsystem
/// behaves as expected: It will succeed if all valid chunks of other backing groups get stored
/// and no other.
///
/// We try to be as agnostic about details as possible, how the subsystem achieves those goals
/// should not be a matter to this test suite.
///
/// The function only returns once a store was observed for every entry of `valid_chunks`; it
/// then sends `OverseerSignal::Conclude` to the subsystem. Any unexpected message from the
/// subsystem results in a panic.
async fn run_inner(mut self, mut harness: TestHarness) {
// We skip genesis here (in reality ActiveLeavesUpdate can also skip a block):
//
// One update per pair of consecutive blocks: the newer block gets activated, the older one
// deactivated.
let updates = {
let mut advanced = self.relay_chain.iter();
advanced.next();
self.relay_chain
.iter()
.zip(advanced)
.map(|(old, new)| ActiveLeavesUpdate {
activated: Some(new_leaf(*new, 1)),
deactivated: vec![*old].into(),
})
.collect::<Vec<_>>()
};
// We should be storing all valid chunks during execution:
//
// Test will fail if this does not happen until timeout.
let mut remaining_stores = self.valid_chunks.len();
let TestSubsystemContextHandle { tx, mut rx, .. } = harness.virtual_overseer;
// Spawning necessary as incoming queue can only hold a single item, we don't want to dead
// lock ;-)
let update_tx = tx.clone();
harness.pool.spawn(
"sending-active-leaves-updates",
None,
async move {
for update in updates {
overseer_signal(update_tx.clone(), OverseerSignal::ActiveLeaves(update)).await;
// We need to give the subsystem a little time to do its job, otherwise it will
// cancel jobs as obsolete:
Delay::new(Duration::from_millis(100)).await;
}
}
.boxed(),
);
// Play the role of all other subsystems until every valid chunk got stored.
while remaining_stores > 0 {
gum::trace!(target: LOG_TARGET, remaining_stores, "Stores left to go");
let msg = overseer_recv(&mut rx).await;
match msg {
// Outgoing chunk requests are fed right back into the subsystem as incoming requests
// of the configured protocol version, so it ends up answering its own requests.
AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendRequests(
reqs,
IfDisconnected::ImmediateError,
)) => {
for req in reqs {
// Forward requests:
match self.chunk_response_protocol {
Protocol::ChunkFetchingV1 => {
let in_req = to_incoming_req_v1(
&harness.pool,
req,
self.req_protocol_names.get_name(Protocol::ChunkFetchingV1),
);
harness
.chunk_req_v1_cfg
.inbound_queue
.as_mut()
.unwrap()
.send(in_req.into_raw())
.await
.unwrap();
},
Protocol::ChunkFetchingV2 => {
let in_req = to_incoming_req_v2(
&harness.pool,
req,
self.req_protocol_names.get_name(Protocol::ChunkFetchingV2),
);
harness
.chunk_req_v2_cfg
.inbound_queue
.as_mut()
.unwrap()
.send(in_req.into_raw())
.await
.unwrap();
},
_ => panic!("Unexpected protocol"),
}
}
},
// Answer with the next prepared response for that chunk. Unknown keys and exhausted
// response lists both result in `None`.
AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryChunk(
candidate_hash,
validator_index,
tx,
)) => {
let chunk = self
.chunks
.get_mut(&(candidate_hash, validator_index))
.and_then(Vec::pop)
.flatten();
tx.send(chunk).expect("Receiver is expected to be alive");
},
// A store request is what we are waiting for: it must be for a valid chunk and carry
// the chunk index expected for this node.
AllMessages::AvailabilityStore(AvailabilityStoreMessage::StoreChunk {
candidate_hash,
chunk,
validator_index,
tx,
..
}) => {
assert!(
self.valid_chunks.contains(&(candidate_hash, validator_index)),
"Only valid chunks should ever get stored."
);
assert_eq!(self.our_chunk_index, chunk.index);
tx.send(Ok(())).expect("Receiver is expected to be alive");
gum::trace!(target: LOG_TARGET, "'Stored' fetched chunk.");
remaining_stores -= 1;
},
// Runtime API requests are answered from the mock data of this `TestState`.
AllMessages::RuntimeApi(RuntimeApiMessage::Request(hash, req)) => {
match req {
RuntimeApiRequest::SessionIndexForChild(tx) => {
// Always session index 1 for now:
tx.send(Ok(1)).expect("Receiver should still be alive");
},
RuntimeApiRequest::SessionInfo(_, tx) => {
tx.send(Ok(Some(self.session_info.clone())))
.expect("Receiver should be alive.");
},
RuntimeApiRequest::SessionExecutorParams(_, tx) => {
tx.send(Ok(Some(ExecutorParams::default())))
.expect("Receiver should be alive.");
},
RuntimeApiRequest::AvailabilityCores(tx) => {
// Indexing panics for blocks without an entry in `cores`.
gum::trace!(target: LOG_TARGET, cores= ?self.cores[&hash], hash = ?hash, "Sending out cores for hash");
tx.send(Ok(self.cores[&hash].clone()))
.expect("Receiver should still be alive");
},
RuntimeApiRequest::NodeFeatures(_, tx) => {
tx.send(Ok(self.node_features.clone()))
.expect("Receiver should still be alive");
},
_ => {
panic!("Unexpected runtime request: {:?}", req);
},
}
},
// Ancestors are the blocks preceding `hash` in `relay_chain`, closest ancestor first and
// at most `k` of them. An unknown `hash` yields an empty list.
AllMessages::ChainApi(ChainApiMessage::Ancestors { hash, k, response_channel }) => {
let chain = &self.relay_chain;
let maybe_block_position = chain.iter().position(|h| *h == hash);
let ancestors = maybe_block_position
.map(|idx| chain[..idx].iter().rev().take(k).copied().collect())
.unwrap_or_default();
response_channel.send(Ok(ancestors)).expect("Receiver is expected to be alive");
},
_ => {
panic!("Received unexpected message")
},
}
}
// All expected stores happened, ask the subsystem to shut down.
overseer_signal(tx, OverseerSignal::Conclude).await;
}
}
/// Send a signal to the subsystem under test.
///
/// Panics if the subsystem no longer accepts messages.
async fn overseer_signal(
	mut tx: mpsc::Sender<FromOrchestra<AvailabilityDistributionMessage>>,
	msg: impl Into<OverseerSignal>,
) {
	let signal: OverseerSignal = msg.into();
	gum::trace!(target: LOG_TARGET, msg = ?signal, "sending message");

	let delivery = tx.send(FromOrchestra::Signal(signal)).await;
	delivery.expect("Test subsystem no longer live");
}
/// Wait for the next message sent out by the subsystem under test.
///
/// Panics if the message stream has ended.
async fn overseer_recv(rx: &mut mpsc::UnboundedReceiver<AllMessages>) -> AllMessages {
	gum::trace!(target: LOG_TARGET, "waiting for message ...");
	match rx.next().await {
		Some(message) => message,
		None => panic!("Test subsystem no longer live"),
	}
}
/// Turn an outgoing chunk fetching request into an incoming v1 request.
///
/// The v1 fallback of the outgoing request becomes the payload. Responses to the returned
/// request are forwarded to the response channel of the outgoing request.
///
/// Panics if `outgoing` is not a chunk fetching request with a v1 fallback.
fn to_incoming_req_v1(
	executor: &TaskExecutor,
	outgoing: Requests,
	protocol_name: ProtocolName,
) -> IncomingRequest<v1::ChunkFetchingRequest> {
	let (pending_response, fallback_request, fallback_protocol) = match outgoing {
		Requests::ChunkFetching(OutgoingRequest {
			pending_response,
			fallback_request: Some((fallback_request, fallback_protocol)),
			..
		}) => (pending_response, fallback_request, fallback_protocol),
		_ => panic!("Unexpected request!"),
	};
	assert_eq!(fallback_protocol, Protocol::ChunkFetchingV1);

	let response_tx = spawn_message_forwarding(executor, protocol_name, pending_response);
	// The sending peer does not matter for the tests, so a random one is used.
	IncomingRequest::new(network::PeerId::random().into(), fallback_request, response_tx)
}
/// Turn an outgoing chunk fetching request into an incoming v2 request.
///
/// The main payload of the outgoing request becomes the payload. Responses to the returned
/// request are forwarded to the response channel of the outgoing request.
///
/// Panics if `outgoing` is not a chunk fetching request with a v1 fallback.
fn to_incoming_req_v2(
	executor: &TaskExecutor,
	outgoing: Requests,
	protocol_name: ProtocolName,
) -> IncomingRequest<v2::ChunkFetchingRequest> {
	let (payload, pending_response, fallback_protocol) = match outgoing {
		Requests::ChunkFetching(OutgoingRequest {
			payload,
			pending_response,
			fallback_request: Some((_, fallback_protocol)),
			..
		}) => (payload, pending_response, fallback_protocol),
		_ => panic!("Unexpected request!"),
	};
	assert_eq!(fallback_protocol, Protocol::ChunkFetchingV1);

	let response_tx = spawn_message_forwarding(executor, protocol_name, pending_response);
	// The sending peer does not matter for the tests, so a random one is used.
	IncomingRequest::new(network::PeerId::random().into(), payload, response_tx)
}
/// Spawn a task that forwards a single response to the channel of the original request.
///
/// Returns the sender the response has to be put into. A successful response is forwarded
/// together with `protocol_name`, a failed one is forwarded as `RequestFailure::Refused`.
///
/// The spawned task panics if the returned sender is dropped without sending or if the
/// response cannot be delivered to `pending_response`.
fn spawn_message_forwarding(
	executor: &TaskExecutor,
	protocol_name: ProtocolName,
	pending_response: oneshot::Sender<Result<(Vec<u8>, ProtocolName), RequestFailure>>,
) -> oneshot::Sender<OutgoingResponse> {
	let (response_tx, response_rx) = oneshot::channel::<netconfig::OutgoingResponse>();

	let forwarding = async move {
		let response = response_rx.await.expect("Unexpected canceled request");
		let forwarded = match response.result {
			Ok(bytes) => Ok((bytes, protocol_name)),
			Err(_) => Err(RequestFailure::Refused),
		};
		pending_response
			.send(forwarded)
			.expect("Sending response is expected to work");
	};
	executor.spawn("message-forwarding", None, forwarding.boxed());

	response_tx
}
@@ -0,0 +1,67 @@
[package]
name = "pezkuwi-availability-recovery"
description = "The Availability Recovery subsystem. Handles requests for recovering the availability data of included candidates."
version = "7.0.0"
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[[bench]]
name = "availability-recovery-regression-bench"
path = "benches/availability-recovery-regression-bench.rs"
harness = false
required-features = ["subsystem-benchmarks"]
[dependencies]
async-trait = { workspace = true }
fatality = { workspace = true }
futures = { workspace = true }
gum = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
schnellru = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, default-features = true }
codec = { features = ["derive"], workspace = true }
pezkuwi-erasure-coding = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
[dev-dependencies]
assert_matches = { workspace = true }
futures-timer = { workspace = true }
rstest = { workspace = true }
sp-core = { workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
sp-tracing = { workspace = true, default-features = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-primitives-test-helpers = { workspace = true }
pezkuwi-subsystem-bench = { workspace = true }
[features]
subsystem-benchmarks = []
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-erasure-coding/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"pezkuwi-subsystem-bench/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
@@ -0,0 +1,81 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! availability-read regression tests
//!
//! Availability read benchmark based on Kusama parameters and scale.
//!
//! Subsystems involved:
//! - availability-recovery
use pezkuwi_subsystem_bench::{
availability::{
benchmark_availability_read, prepare_test, DataAvailabilityReadOptions, Strategy,
TestDataAvailability, TestState,
},
configuration::TestConfiguration,
usage::BenchmarkUsage,
utils::save_to_file,
};
use std::io::Write;
/// Number of benchmark runs the reported usage is averaged over.
const BENCH_COUNT: usize = 10;
/// Run the availability read benchmark `BENCH_COUNT` times and check the averaged usage.
///
/// The averaged usage is written to `charts/availability-recovery-regression-bench.json` and
/// printed. Returns an error if the chart cannot be produced or saved, or if the network or
/// CPU usage checks report anything.
fn main() -> Result<(), String> {
	let options = DataAvailabilityReadOptions { strategy: Strategy::FullFromBackers };
	let mut config = TestConfiguration::default();
	config.num_blocks = 3;
	config.generate_pov_sizes();
	let state = TestState::new(&config);

	println!("Benchmarking...");
	let mut usages: Vec<BenchmarkUsage> = Vec::with_capacity(BENCH_COUNT);
	for n in 0..BENCH_COUNT {
		// Simple progress bar, redrawn in place before every run.
		print!("\r[{}{}]", "#".repeat(n), "_".repeat(BENCH_COUNT - n));
		std::io::stdout().flush().unwrap();

		let (mut env, _cfgs) =
			prepare_test(&state, TestDataAvailability::Read(options.clone()), false);
		usages.push(env.runtime().block_on(benchmark_availability_read(&mut env, &state)));
	}
	println!("\rDone!{}", " ".repeat(BENCH_COUNT));

	let average_usage = BenchmarkUsage::average(&usages);
	let chart_json = average_usage.to_chart_json().map_err(|e| e.to_string())?;
	save_to_file("charts/availability-recovery-regression-bench.json", chart_json)
		.map_err(|e| e.to_string())?;
	println!("{}", average_usage);

	let mut regressions = Vec::new();
	// We expect no variance for received and sent
	// but use 0.001 because we operate with floats
	regressions.extend(average_usage.check_network_usage(&[
		("Received from peers", 307203.0000, 0.001),
		("Sent to peers", 1.6667, 0.001),
	]));
	regressions
		.extend(average_usage.check_cpu_usage(&[("availability-recovery", 11.2758, 0.1)]));

	if !regressions.is_empty() {
		eprintln!("{}", regressions.join("\n"));
		return Err("Regressions found".to_string())
	}
	Ok(())
}
@@ -0,0 +1,91 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! The `Error` and `Result` types used by the subsystem.
use crate::LOG_TARGET;
use fatality::{fatality, Nested};
use futures::channel::oneshot;
use pezkuwi_node_network_protocol::request_response::incoming;
use pezkuwi_node_subsystem::{RecoveryError, SubsystemError};
use pezkuwi_primitives::Hash;
/// Error type used by the Availability Recovery subsystem.
///
/// Variants carrying `#[fatal]` are classified as fatal by the `fatality` macro, variants with
/// `#[fatal(forward)]` delegate that classification to the wrapped error, and all other variants
/// are non-fatal. `log_error` in this module logs the non-fatal ones and returns the fatal ones.
#[fatality(splitable)]
pub enum Error {
/// Spawning a subsystem task failed.
#[fatal]
#[error("Spawning subsystem task failed: {0}")]
SpawnTask(#[source] SubsystemError),
/// Receiving subsystem message from overseer failed.
#[fatal]
#[error("Receiving message from overseer failed: {0}")]
SubsystemReceive(#[source] SubsystemError),
/// The response channel of a query sent to the availability store was canceled.
#[fatal]
#[error("failed to query full data from store")]
CanceledQueryFullData(#[source] oneshot::Canceled),
/// No session info could be obtained; the hash is the block the query was made at.
#[error("`SessionInfo` is `None` at {0}")]
SessionInfoUnavailable(Hash),
/// Querying the node features from the runtime failed.
#[error("failed to query node features from runtime")]
RequestNodeFeatures(#[source] pezkuwi_node_subsystem_util::runtime::Error),
/// The receiving side of a response sender was dropped before the response could be sent.
#[error("failed to send response")]
CanceledResponseSender,
/// A runtime API error, displayed transparently.
#[error(transparent)]
Runtime(#[from] pezkuwi_node_subsystem::errors::RuntimeApiError),
/// An erasure coding error, displayed transparently.
#[error(transparent)]
Erasure(#[from] pezkuwi_erasure_coding::Error),
/// A oneshot channel was canceled.
#[fatal]
#[error(transparent)]
Oneshot(#[from] oneshot::Canceled),
/// A recovery failed; whether this is fatal is decided by the wrapped `RecoveryError`.
#[fatal(forward)]
#[error("Error during recovery: {0}")]
Recovery(#[from] RecoveryError),
/// Receiving the next incoming request failed; fatality is decided by the wrapped error.
#[fatal(forward)]
#[error("Retrieving next incoming request failed: {0}")]
IncomingRequest(#[from] incoming::Error),
}
/// Shorthand for a `std::result::Result` whose error type is this module's `Error`.
pub type Result<T> = std::result::Result<T, Error>;
/// Utility for swallowing top level errors after logging them.
///
/// We basically always want to carry on after an error, unless that error is fatal for the
/// entire subsystem: fatal errors are handed back to the caller, everything else is logged
/// and replaced by `Ok(())`.
pub fn log_error(result: Result<()>) -> std::result::Result<(), FatalError> {
    // `?` returns the fatal part early; what remains is either success or a
    // non-fatal error that only needs to be logged.
    if let Err(jfyi) = result.into_nested()? {
        jfyi.log();
    }
    Ok(())
}
impl JfyiError {
/// Log a `JfyiError`.
///
/// Consumes the error and emits it at `warn` level under this crate's `LOG_TARGET`, using the
/// error's `Display` output as the message.
pub fn log(self) {
gum::warn!(target: LOG_TARGET, "{}", self);
}
}
@@ -0,0 +1,236 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! FuturesUndead: A `FuturesUnordered` with support for semi canceled futures. Those undead
//! futures will still get polled, but will not count towards length. So length will only count
//! futures, which are still considered live.
//!
//! Use case: If futures take longer than we would like them to, we may be able to request the data
//! from somewhere else as well. We don't really want to cancel the old future, because maybe it
//! was almost done, thus we would have wasted time with our impatience. By simply making them
//! not count towards length, we can make sure to have enough "live" requests ongoing, while at the
//! same time taking advantage of some maybe "late" response from the undead.
use std::{
pin::Pin,
task::{Context, Poll},
time::Duration,
};
use futures::{future::BoxFuture, stream::FuturesUnordered, Future, Stream, StreamExt};
use pezkuwi_node_subsystem_util::TimeoutExt;
/// FuturesUndead - `FuturesUnordered` with semi canceled (undead) futures.
///
/// Undead futures are still polled and their output is still yielded, but they are not counted
/// by `len`.
///
/// Limitations: Keeps track of undead futures by means of a `usize` counter. On targets where
/// `usize` is 64 bits wide that is `1.8*10^19` pushed futures; once the counter overflows, this
/// implementation will panic.
pub struct FuturesUndead<Output> {
/// Actual `FuturesUnordered`.
inner: FuturesUnordered<Undead<Output>>,
/// Next sequence number to assign to the next future that gets pushed.
next_sequence: SequenceNumber,
/// Sequence number of first future considered live.
///
/// `None` until the first `soft_cancel`; futures with a smaller sequence number are undead.
first_live: Option<SequenceNumber>,
/// How many undead are there right now.
undead: usize,
}
/// All futures get a number, to determine which are live.
///
/// Numbers are handed out in push order, so comparing a future's number against
/// `FuturesUndead::first_live` tells whether it was pushed before the last soft cancel.
#[derive(Eq, PartialEq, Copy, Clone, Debug, PartialOrd)]
struct SequenceNumber(usize);
/// A pushed future together with the sequence number it was assigned on push.
///
/// Its `Future` implementation yields the sequence number alongside the inner output.
struct Undead<Output> {
/// The wrapped future.
inner: BoxFuture<'static, Output>,
/// Sequence number assigned by `FuturesUndead::push`.
our_sequence: SequenceNumber,
}
impl<Output> FuturesUndead<Output> {
    /// Create an empty collection: no futures, no undead, sequence numbers starting at zero.
    pub fn new() -> Self {
        let inner = FuturesUnordered::new();
        Self { inner, next_sequence: SequenceNumber(0), first_live: None, undead: 0 }
    }

    /// Add a future, tagging it with the next free sequence number.
    ///
    /// Panics if the sequence counter overflows.
    pub fn push(&mut self, f: BoxFuture<'static, Output>) {
        let our_sequence = self.next_sequence;
        self.inner.push(Undead { inner: f, our_sequence });
        self.next_sequence.inc();
    }

    /// Turn every currently contained future into an undead one.
    ///
    /// From now on they are ignored by `len`, but they keep being polled.
    pub fn soft_cancel(&mut self) {
        // Everything pushed so far is undead, everything pushed from here on is live.
        let contained = self.total_len();
        self.first_live = Some(self.next_sequence);
        self.undead = contained;
    }

    /// Number of live futures, i.e. contained futures minus undead ones.
    pub fn len(&self) -> usize {
        let total = self.total_len();
        total - self.undead
    }

    /// Number of all contained futures, undead ones included.
    pub fn total_len(&self) -> usize {
        self.inner.len()
    }

    /// Wait for the next future to finish, but no longer than `timeout`.
    ///
    /// If the timeout elapses first, all currently contained futures are made undead and
    /// `None` is returned. Otherwise the result of `next()` is returned as is.
    pub async fn next_with_timeout(&mut self, timeout: Duration) -> Option<Output> {
        let outcome = self.next().timeout(timeout).await;
        outcome.unwrap_or_else(|| {
            // Timed out.
            self.soft_cancel();
            None
        })
    }
}
impl<Output> Stream for FuturesUndead<Output> {
    type Item = Output;

    /// Poll the underlying `FuturesUnordered` and keep the undead bookkeeping up to date
    /// whenever a future finishes.
    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let (sequence, output) = match self.inner.poll_next_unpin(cx) {
            Poll::Ready(Some(finished)) => finished,
            Poll::Ready(None) => return Poll::Ready(None),
            Poll::Pending => return Poll::Pending,
        };

        if self.inner.is_empty() {
            // Nothing left at all: start over with fresh counters.
            *self = Self::new();
        } else if let Some(first_live) = self.first_live {
            // A future pushed before the last soft cancel came back, so one undead less.
            if sequence < first_live {
                self.undead = self.undead.saturating_sub(1);
            }
        }

        Poll::Ready(Some(output))
    }
}
impl SequenceNumber {
    /// Advance to the next sequence number.
    ///
    /// Panics if the underlying counter would overflow.
    pub fn inc(&mut self) {
        let successor = self.0.checked_add(1);
        self.0 = successor.expect(
            "We don't expect an `UndeadFuture` to live long enough for 2^64 entries ever getting inserted."
        );
    }
}
impl<T> Future for Undead<T> {
    type Output = (SequenceNumber, T);

    /// Poll the wrapped future and attach our sequence number to its output once it is ready.
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let sequence = self.our_sequence;
        self.inner.as_mut().poll(cx).map(|output| (sequence, output))
    }
}
// Unit tests for the live/undead bookkeeping of `FuturesUndead`.
#[cfg(test)]
mod tests {
use super::*;
use futures::{executor, pending, FutureExt};
// A soft cancel hides every contained future from `len`.
#[test]
fn cancel_sets_len_to_zero() {
let mut undead = FuturesUndead::new();
undead.push((async { () }).boxed());
assert_eq!(undead.len(), 1);
undead.soft_cancel();
assert_eq!(undead.len(), 0);
}
// A finishing undead future must not reduce the number of live futures.
#[test]
fn finished_undead_does_not_change_len() {
executor::block_on(async {
let mut undead = FuturesUndead::new();
undead.push(async { 1_i32 }.boxed());
undead.push(async { 2_i32 }.boxed());
assert_eq!(undead.len(), 2);
undead.soft_cancel();
assert_eq!(undead.len(), 0);
// The only live future yields once before completing.
undead.push(
async {
pending!();
0_i32
}
.boxed(),
);
undead.next().await;
// One future finished, yet the live count is still 1.
assert_eq!(undead.len(), 1);
undead.push(async { 9_i32 }.boxed());
undead.soft_cancel();
assert_eq!(undead.len(), 0);
});
}
// Live futures finishing while undead ones are still pending reduce `len` one by one.
#[test]
fn len_stays_correct_when_live_future_ends() {
executor::block_on(async {
let mut undead = FuturesUndead::new();
// Two futures that yield once before completing; they become the undead ones.
undead.push(
async {
pending!();
1_i32
}
.boxed(),
);
undead.push(
async {
pending!();
2_i32
}
.boxed(),
);
assert_eq!(undead.len(), 2);
undead.soft_cancel();
assert_eq!(undead.len(), 0);
// Two immediately ready live futures.
undead.push(async { 0_i32 }.boxed());
undead.push(async { 1_i32 }.boxed());
undead.next().await;
assert_eq!(undead.len(), 1);
undead.next().await;
assert_eq!(undead.len(), 0);
undead.push(async { 9_i32 }.boxed());
assert_eq!(undead.len(), 1);
});
}
// Once the collection runs empty, the bookkeeping is reset (`first_live` is `None` again).
#[test]
fn cleanup_works() {
executor::block_on(async {
let mut undead = FuturesUndead::new();
undead.push(async { 1_i32 }.boxed());
undead.soft_cancel();
undead.push(async { 2_i32 }.boxed());
undead.next().await;
undead.next().await;
assert_eq!(undead.first_live, None);
});
}
}
@@ -0,0 +1,925 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Availability Recovery Subsystem of Pezkuwi.
#![warn(missing_docs)]
use std::{
collections::{BTreeMap, VecDeque},
iter::Iterator,
num::NonZeroUsize,
pin::Pin,
};
use futures::{
channel::oneshot,
future::{Future, FutureExt, RemoteHandle},
pin_mut,
prelude::*,
sink::SinkExt,
stream::{FuturesUnordered, StreamExt},
task::{Context, Poll},
};
use sc_network::ProtocolName;
use schnellru::{ByLength, LruMap};
use task::{
FetchChunks, FetchChunksParams, FetchFull, FetchFullParams, FetchSystematicChunks,
FetchSystematicChunksParams,
};
use pezkuwi_erasure_coding::{
branches, obtain_chunks_v1, recovery_threshold, systematic_recovery_threshold,
Error as ErasureEncodingError,
};
use task::{RecoveryParams, RecoveryStrategy, RecoveryTask};
use error::{log_error, Error, FatalError, Result};
use pezkuwi_node_network_protocol::{
request_response::{
v1 as request_v1, v2 as request_v2, IncomingRequestReceiver, IsRequest, ReqProtocolNames,
},
UnifiedReputationChange as Rep,
};
use pezkuwi_node_primitives::AvailableData;
use pezkuwi_node_subsystem::{
errors::RecoveryError,
messages::{AvailabilityRecoveryMessage, AvailabilityStoreMessage},
overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem,
SubsystemContext, SubsystemError,
};
use pezkuwi_node_subsystem_util::{
availability_chunks::availability_chunk_indices,
runtime::{ExtendedSessionInfo, RuntimeInfo},
};
use pezkuwi_primitives::{
node_features, BlockNumber, CandidateHash, CandidateReceiptV2 as CandidateReceipt, ChunkIndex,
CoreIndex, GroupIndex, Hash, SessionIndex, ValidatorIndex,
};
mod error;
mod futures_undead;
mod metrics;
mod task;
pub use metrics::Metrics;
#[cfg(test)]
mod tests;
/// Outcome of a recovery: the recovered `AvailableData` or the reason recovery failed.
type RecoveryResult = std::result::Result<AvailableData, RecoveryError>;
/// Log target used by this subsystem.
const LOG_TARGET: &str = "teyrchain::availability-recovery";
/// Size of the LRU cache where we keep recovered data.
const LRU_SIZE: u32 = 16;
/// Reputation change applied to a peer that sent a request which could not be parsed.
const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
/// PoV size limit in bytes below which we prefer fetching from backers. (conservative, Pezkuwi
/// for now)
pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
/// PoV size limit in bytes below which we prefer fetching from backers. (Kusama and all
/// testnets)
pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
/// The strategy we use to recover the PoV.
///
/// The `usize` carried by the size-dependent variants is the PoV size limit in bytes: the
/// backing group is only tried first if the estimated PoV size is lower than that limit.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum RecoveryStrategyKind {
    /// We try the backing group first if PoV size is lower than specified, then fallback to
    /// validator chunks.
    BackersFirstIfSizeLower(usize),
    /// We try the backing group first if PoV size is lower than specified, then fallback to
    /// systematic chunks. Regular chunk recovery as a last resort.
    BackersFirstIfSizeLowerThenSystematicChunks(usize),

    // The following variants are only helpful for integration tests, which is why they may be
    // unused in regular builds.
    /// We always try the backing group first, then fallback to validator chunks.
    #[allow(dead_code)]
    BackersFirstAlways,
    /// We always recover using validator chunks.
    #[allow(dead_code)]
    ChunksAlways,
    /// First try the backing group. Then systematic chunks.
    #[allow(dead_code)]
    BackersThenSystematicChunks,
    /// Always recover using systematic chunks, fall back to regular chunks.
    #[allow(dead_code)]
    SystematicChunks,
}
/// The Availability Recovery Subsystem.
pub struct AvailabilityRecoverySubsystem {
/// PoV recovery strategy to use.
recovery_strategy_kind: RecoveryStrategyKind,
/// If this is true, do not request data from the availability store.
/// This is useful for nodes where the
/// availability-store subsystem is not expected to run,
/// such as collators.
bypass_availability_store: bool,
/// Receiver for available data requests.
req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
/// Metrics for this subsystem.
metrics: Metrics,
/// The type of check to perform after available data was recovered.
post_recovery_check: PostRecoveryCheck,
/// Full protocol name for ChunkFetchingV1.
req_v1_protocol_name: ProtocolName,
/// Full protocol name for ChunkFetchingV2.
req_v2_protocol_name: ProtocolName,
}
#[derive(Clone, PartialEq, Debug)]
/// The type of check to perform after available data was recovered.
///
/// The constructors of `AvailabilityRecoverySubsystem` pick the variant: `for_validator` uses
/// `Reencode`, `for_collator` uses `PovHash`.
enum PostRecoveryCheck {
/// Reencode the data and check erasure root. For validators.
Reencode,
/// Only check the pov hash. For collators only.
PovHash,
}
/// Expensive erasure coding computations that we want to run on a blocking thread.
///
/// Tasks are received by the subsystem main loop and forwarded to a worker of the thread pool;
/// the result is delivered through the `oneshot` sender carried by each variant.
enum ErasureTask {
/// Reconstructs `AvailableData` from chunks given `n_validators`.
///
/// Fields: `n_validators`, the chunks keyed by chunk index, and the sender for the result.
Reconstruct(
usize,
BTreeMap<ChunkIndex, Vec<u8>>,
oneshot::Sender<std::result::Result<AvailableData, ErasureEncodingError>>,
),
/// Re-encode `AvailableData` into erasure chunks in order to verify the provided root hash of
/// the Merkle tree.
///
/// Fields: a `usize` (presumably `n_validators`, as for `Reconstruct` — confirm against the
/// worker), the expected root hash, the data to re-encode, and the sender for the result.
Reencode(usize, Hash, AvailableData, oneshot::Sender<Option<AvailableData>>),
}
/// Re-encode `data` into erasure chunks and check that the root of the Merkle tree built on
/// top of those chunks equals `expected_root`.
///
/// This check is expensive, but necessary: without it we cannot rule out that the backers
/// tampered with some chunks, in which case validators that fetched different sets of chunks
/// would disagree on the validity of the data. Checking the Merkle proof of an individual
/// chunk only guarantees that the chunk belongs to the set the backers committed to.
///
/// NOTE: Running the check on already decoded data is fine. If decoding failed for some
/// validators, then either chunks were tampered with (by the backers) or the data was invalid
/// from the start. In the first case validators fetching valid chunks see invalid data as
/// well, because the root will not match; in the second case everyone is in the same
/// situation anyway.
///
/// Returns `false` if the chunks cannot be obtained or if the roots differ.
fn reconstructed_data_matches_root(
    n_validators: usize,
    expected_root: &Hash,
    data: &AvailableData,
    metrics: &Metrics,
) -> bool {
    // Held until the function returns.
    let _timer = metrics.time_reencode_chunks();

    match obtain_chunks_v1(n_validators, data) {
        Ok(chunks) => branches(&chunks).root() == *expected_root,
        Err(error) => {
            gum::debug!(
                target: LOG_TARGET,
                err = ?error,
                "Failed to obtain chunks",
            );
            false
        },
    }
}
/// Accumulate all awaiting sides for some particular `AvailableData`.
///
/// Polling the handle drives delivery: once `remote` resolves, the result is cloned to every
/// sender in `awaiting`.
struct RecoveryHandle {
/// Hash of the candidate whose data is being recovered.
candidate_hash: CandidateHash,
/// Handle to the spawned recovery task, resolving to its result.
remote: RemoteHandle<RecoveryResult>,
/// Everyone waiting for the result of this recovery.
awaiting: Vec<oneshot::Sender<RecoveryResult>>,
}
impl Future for RecoveryHandle {
    type Output = Option<(CandidateHash, RecoveryResult)>;

    /// Resolves to `None` once nobody is waiting for the result any more, and to the candidate
    /// hash plus the recovery result once the recovery task has finished. In the latter case
    /// the result has been sent to every remaining waiter.
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        // Collect the positions of all waiters that went away, highest index first, so that
        // the `swap_remove` calls below never invalidate an index that is still to be removed.
        let dropped: Vec<usize> = self
            .awaiting
            .iter_mut()
            .enumerate()
            .rev()
            .filter(|(_, sender)| sender.poll_canceled(cx).is_ready())
            .map(|(position, _)| position)
            .collect();

        for position in dropped {
            gum::debug!(
                target: LOG_TARGET,
                candidate_hash = ?self.candidate_hash,
                "Receiver for available data dropped.",
            );
            self.awaiting.swap_remove(position);
        }

        if self.awaiting.is_empty() {
            gum::debug!(
                target: LOG_TARGET,
                candidate_hash = ?self.candidate_hash,
                "All receivers for available data dropped.",
            );
            return Poll::Ready(None);
        }

        let result = match Pin::new(&mut self.remote).poll(cx) {
            Poll::Ready(result) => result,
            Poll::Pending => return Poll::Pending,
        };

        // Hand a copy of the result to everyone still waiting; a failed send only means that
        // the receiver is gone, which is fine to ignore.
        self.awaiting.drain(..).for_each(|sender| {
            let _ = sender.send(result.clone());
        });

        Poll::Ready(Some((self.candidate_hash, result)))
    }
}
/// Cached result of an availability recovery operation.
///
/// Only conclusive outcomes are represented here; see the `TryFrom<RecoveryResult>`
/// implementation for which results are not cached.
#[derive(Debug, Clone)]
enum CachedRecovery {
/// Availability was successfully retrieved before.
Valid(AvailableData),
/// Availability was successfully retrieved before, but was found to be invalid.
Invalid,
}
impl CachedRecovery {
    /// Turn the cache entry back into the `Result` that gets delivered as a response.
    fn into_result(self) -> RecoveryResult {
        match self {
            Self::Invalid => Err(RecoveryError::Invalid),
            Self::Valid(data) => Ok(data),
        }
    }
}
impl TryFrom<RecoveryResult> for CachedRecovery {
    type Error = ();

    /// Convert a recovery result into a cache entry, failing with `()` for results that
    /// should not be cached.
    fn try_from(o: RecoveryResult) -> std::result::Result<CachedRecovery, Self::Error> {
        o.map(Self::Valid).or_else(|error| match error {
            RecoveryError::Invalid => Ok(Self::Invalid),
            // Unavailability is not cached: that state might change, so a later request
            // should trigger a fresh attempt.
            RecoveryError::Unavailable => Err(()),
            RecoveryError::ChannelClosed => Err(()),
        })
    }
}
/// Mutable state of the subsystem's main loop.
struct State {
/// Each recovery task is implemented as its own async task,
/// and these handles are for communicating with them.
ongoing_recoveries: FuturesUnordered<RecoveryHandle>,
/// A recent block hash for which state should be available.
///
/// Updated by `handle_signal` whenever a leaf with a higher block number is activated.
live_block: (BlockNumber, Hash),
/// An LRU cache of recently recovered data.
availability_lru: LruMap<CandidateHash, CachedRecovery>,
/// Cached runtime info.
runtime_info: RuntimeInfo,
}
impl Default for State {
/// Initial state: no ongoing recoveries, block number 0 with the default hash as live block,
/// an empty LRU cache bounded by `LRU_SIZE` and a `RuntimeInfo` created with `None`.
fn default() -> Self {
Self {
ongoing_recoveries: FuturesUnordered::new(),
live_block: (0, Hash::default()),
availability_lru: LruMap::new(ByLength::new(LRU_SIZE)),
runtime_info: RuntimeInfo::new(None),
}
}
}
#[overseer::subsystem(AvailabilityRecovery, error=SubsystemError, prefix=self::overseer)]
impl<Context> AvailabilityRecoverySubsystem {
/// Wrap the subsystem's main loop (`run`) into a `SpawnedSubsystem`.
///
/// An error returned by `run` is converted into a `SubsystemError` with origin
/// "availability-recovery".
fn start(self, ctx: Context) -> SpawnedSubsystem {
let future = self
.run(ctx)
.map_err(|e| SubsystemError::with_origin("availability-recovery", e))
.boxed();
SpawnedSubsystem { name: "availability-recovery-subsystem", future }
}
}
/// Process a signal coming from the overseer.
///
/// Returns `true` if the signal tells the subsystem to conclude, `false` otherwise.
async fn handle_signal(state: &mut State, signal: OverseerSignal) -> bool {
    match signal {
        OverseerSignal::Conclude => return true,
        OverseerSignal::BlockFinalized(_, _) => {},
        OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, .. }) => match activated {
            // Only ever move `live_block` forward, to the highest activated block seen so far.
            Some(leaf) if leaf.number > state.live_block.0 =>
                state.live_block = (leaf.number, leaf.hash),
            _ => {},
        },
    }
    false
}
/// Machinery around launching recovery tasks into the background.
#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
async fn launch_recovery_task<Context>(
state: &mut State,
ctx: &mut Context,
response_sender: oneshot::Sender<RecoveryResult>,
recovery_strategies: VecDeque<Box<dyn RecoveryStrategy<<Context as SubsystemContext>::Sender>>>,
params: RecoveryParams,
) -> Result<()> {
let candidate_hash = params.candidate_hash;
let recovery_task = RecoveryTask::new(ctx.sender().clone(), params, recovery_strategies);
let (remote, remote_handle) = recovery_task.run().remote_handle();
state.ongoing_recoveries.push(RecoveryHandle {
candidate_hash,
remote: remote_handle,
awaiting: vec![response_sender],
});
ctx.spawn("recovery-task", Box::pin(remote))
.map_err(|err| Error::SpawnTask(err))
}
/// Handles an availability recovery request.
///
/// In order:
/// 1. If the candidate is in the LRU cache, the cached result is sent right away.
/// 2. If a recovery for the candidate is already ongoing, `response_sender` is added to its
///    waiters.
/// 3. Otherwise the session info is fetched, the list of recovery strategies is assembled
///    according to `recovery_strategy_kind` and a new recovery task is launched.
///
/// If the session info cannot be obtained, `RecoveryError::Unavailable` is sent to the requester
/// and `Error::SessionInfoUnavailable` is returned.
#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
async fn handle_recover<Context>(
state: &mut State,
ctx: &mut Context,
receipt: CandidateReceipt,
session_index: SessionIndex,
backing_group: Option<GroupIndex>,
response_sender: oneshot::Sender<RecoveryResult>,
metrics: &Metrics,
erasure_task_tx: futures::channel::mpsc::Sender<ErasureTask>,
recovery_strategy_kind: RecoveryStrategyKind,
bypass_availability_store: bool,
post_recovery_check: PostRecoveryCheck,
maybe_core_index: Option<CoreIndex>,
req_v1_protocol_name: ProtocolName,
req_v2_protocol_name: ProtocolName,
) -> Result<()> {
let candidate_hash = receipt.hash();
// Fast path: answer from the cache.
if let Some(result) =
state.availability_lru.get(&candidate_hash).cloned().map(|v| v.into_result())
{
return response_sender.send(result).map_err(|_| Error::CanceledResponseSender);
}
// A recovery for this candidate is already running: just wait for its result as well.
if let Some(i) =
state.ongoing_recoveries.iter_mut().find(|i| i.candidate_hash == candidate_hash)
{
i.awaiting.push(response_sender);
return Ok(());
}
let session_info_res = state
.runtime_info
.get_session_info_by_index(ctx.sender(), state.live_block.1, session_index)
.await;
match session_info_res {
Ok(ExtendedSessionInfo { session_info, node_features, .. }) => {
let mut backer_group = None;
let n_validators = session_info.validators.len();
let systematic_threshold = systematic_recovery_threshold(n_validators)?;
// Strategies are pushed in the order in which they are meant to be tried: at most
// full recovery from backers, systematic chunks and regular chunks.
let mut recovery_strategies: VecDeque<
Box<dyn RecoveryStrategy<<Context as SubsystemContext>::Sender>>,
> = VecDeque::with_capacity(3);
if let Some(backing_group) = backing_group {
if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
// Only relevant for the size-dependent strategy kinds; stays `true` otherwise.
let mut small_pov_size = true;
match recovery_strategy_kind {
RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) |
RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(
fetch_chunks_threshold,
) => {
// Get our own chunk size to get an estimate of the PoV size.
let chunk_size: Result<Option<usize>> =
query_chunk_size(ctx, candidate_hash).await;
if let Ok(Some(chunk_size)) = chunk_size {
let pov_size_estimate = chunk_size * systematic_threshold;
small_pov_size = pov_size_estimate < fetch_chunks_threshold;
if small_pov_size {
gum::trace!(
target: LOG_TARGET,
?candidate_hash,
pov_size_estimate,
fetch_chunks_threshold,
"Prefer fetch from backing group",
);
}
} else {
// we have a POV limit but were not able to query the chunk size, so
// don't use the backing group.
small_pov_size = false;
}
},
_ => {},
};
// Decide whether full recovery from the backing group is attempted first.
match (&recovery_strategy_kind, small_pov_size) {
(RecoveryStrategyKind::BackersFirstAlways, _) |
(RecoveryStrategyKind::BackersFirstIfSizeLower(_), true) |
(
RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(_),
true,
) |
(RecoveryStrategyKind::BackersThenSystematicChunks, _) =>
recovery_strategies.push_back(Box::new(FetchFull::new(
FetchFullParams { validators: backing_validators.to_vec() },
))),
_ => {},
};
// Remembered so that it can be handed to the systematic chunks strategy below.
backer_group = Some(backing_validators);
}
}
let chunk_mapping_enabled = if let Some(&true) = node_features
.get(usize::from(node_features::FeatureIndex::AvailabilityChunkMapping as u8))
.as_deref()
{
true
} else {
false
};
// We can only attempt systematic recovery if we received the core index of the
// candidate and chunk mapping is enabled.
if let Some(core_index) = maybe_core_index {
if matches!(
recovery_strategy_kind,
RecoveryStrategyKind::BackersThenSystematicChunks |
RecoveryStrategyKind::SystematicChunks |
RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(_)
) && chunk_mapping_enabled
{
let chunk_indices =
availability_chunk_indices(node_features, n_validators, core_index)?;
// Pair every chunk index with the index of the validator it is taken from, which
// is the position within `chunk_indices`.
let chunk_indices: VecDeque<_> = chunk_indices
.iter()
.enumerate()
.map(|(v_index, c_index)| {
(
*c_index,
ValidatorIndex(
u32::try_from(v_index)
.expect("validator count should not exceed u32"),
),
)
})
.collect();
// Only get the validators according to the threshold.
let validators = chunk_indices
.clone()
.into_iter()
.filter(|(c_index, _)| {
usize::try_from(c_index.0)
.expect("usize is at least u32 bytes on all modern targets.") <
systematic_threshold
})
.collect();
recovery_strategies.push_back(Box::new(FetchSystematicChunks::new(
FetchSystematicChunksParams {
validators,
backers: backer_group.map(|v| v.to_vec()).unwrap_or_else(|| vec![]),
},
)));
}
}
// Regular chunk recovery is always available as the last resort.
recovery_strategies.push_back(Box::new(FetchChunks::new(FetchChunksParams {
n_validators: session_info.validators.len(),
})));
// NOTE(review): the clone presumably exists to end the borrow of `state.runtime_info`
// before `state` is passed mutably to `launch_recovery_task` — confirm before removing.
let session_info = session_info.clone();
let n_validators = session_info.validators.len();
launch_recovery_task(
state,
ctx,
response_sender,
recovery_strategies,
RecoveryParams {
validator_authority_keys: session_info.discovery_keys.clone(),
n_validators,
threshold: recovery_threshold(n_validators)?,
systematic_threshold,
candidate_hash,
erasure_root: receipt.descriptor.erasure_root(),
metrics: metrics.clone(),
bypass_availability_store,
post_recovery_check,
pov_hash: receipt.descriptor.pov_hash(),
req_v1_protocol_name,
req_v2_protocol_name,
chunk_mapping_enabled,
erasure_task_tx,
},
)
.await
},
Err(_) => {
// Tell the requester that the data is unavailable, then report the missing session info.
response_sender
.send(Err(RecoveryError::Unavailable))
.map_err(|_| Error::CanceledResponseSender)?;
Err(Error::SessionInfoUnavailable(state.live_block.1))
},
}
}
/// Queries the full `AvailableData` from av-store.
///
/// Sends `AvailabilityStoreMessage::QueryAvailableData` and waits for the answer. Fails with
/// `Error::CanceledQueryFullData` if the response channel gets canceled.
#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
async fn query_full_data<Context>(
ctx: &mut Context,
candidate_hash: CandidateHash,
) -> Result<Option<AvailableData>> {
let (tx, rx) = oneshot::channel();
ctx.send_message(AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx))
.await;
rx.await.map_err(Error::CanceledQueryFullData)
}
/// Queries the size of a chunk of the given candidate from av-store.
///
/// Sends `AvailabilityStoreMessage::QueryChunkSize` and waits for the answer.
///
/// NOTE(review): a canceled response channel is reported as `Error::CanceledQueryFullData`,
/// whose message talks about "full data"; a dedicated variant might be clearer.
#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
async fn query_chunk_size<Context>(
ctx: &mut Context,
candidate_hash: CandidateHash,
) -> Result<Option<usize>> {
let (tx, rx) = oneshot::channel();
ctx.send_message(AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx))
.await;
rx.await.map_err(Error::CanceledQueryFullData)
}
#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
impl AvailabilityRecoverySubsystem {
/// Create a new instance of `AvailabilityRecoverySubsystem` suitable for collator nodes.
///
/// The instance is configured to bypass the availability store and to only check the PoV
/// hash instead of re-encoding the available data. If `fetch_chunks_threshold` is `None`,
/// `CONSERVATIVE_FETCH_CHUNKS_THRESHOLD` is used.
pub fn for_collator(
    fetch_chunks_threshold: Option<usize>,
    req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
    req_protocol_names: &ReqProtocolNames,
    metrics: Metrics,
) -> Self {
    let threshold = fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD);
    let req_v1_protocol_name =
        req_protocol_names.get_name(request_v1::ChunkFetchingRequest::PROTOCOL);
    let req_v2_protocol_name =
        req_protocol_names.get_name(request_v2::ChunkFetchingRequest::PROTOCOL);

    Self {
        recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(threshold),
        bypass_availability_store: true,
        post_recovery_check: PostRecoveryCheck::PovHash,
        req_receiver,
        metrics,
        req_v1_protocol_name,
        req_v2_protocol_name,
    }
}
/// Create an optimised new instance of `AvailabilityRecoverySubsystem` suitable for validator
/// nodes, which:
/// - for small POVs (estimated size under the `fetch_chunks_threshold`, or under the
///   `CONSERVATIVE_FETCH_CHUNKS_THRESHOLD` if none is given), attempts full recovery from
///   backers, if backing group supplied.
/// - for large POVs, attempts systematic recovery, if core_index supplied and
///   AvailabilityChunkMapping node feature is enabled.
/// - as a last resort, attempts regular chunk recovery from all validators.
pub fn for_validator(
    fetch_chunks_threshold: Option<usize>,
    req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
    req_protocol_names: &ReqProtocolNames,
    metrics: Metrics,
) -> Self {
    let threshold = fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD);
    let req_v1_protocol_name =
        req_protocol_names.get_name(request_v1::ChunkFetchingRequest::PROTOCOL);
    let req_v2_protocol_name =
        req_protocol_names.get_name(request_v2::ChunkFetchingRequest::PROTOCOL);

    Self {
        recovery_strategy_kind:
            RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(threshold),
        bypass_availability_store: false,
        post_recovery_check: PostRecoveryCheck::Reencode,
        req_receiver,
        metrics,
        req_v1_protocol_name,
        req_v2_protocol_name,
    }
}
/// Create an instance with a custom recovery strategy kind.
///
/// Apart from the strategy kind, the instance is set up to use the availability store and to
/// re-encode recovered data. Currently only useful for tests.
#[cfg(any(test, feature = "subsystem-benchmarks"))]
pub fn with_recovery_strategy_kind(
    req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
    req_protocol_names: &ReqProtocolNames,
    metrics: Metrics,
    recovery_strategy_kind: RecoveryStrategyKind,
) -> Self {
    let req_v1_protocol_name =
        req_protocol_names.get_name(request_v1::ChunkFetchingRequest::PROTOCOL);
    let req_v2_protocol_name =
        req_protocol_names.get_name(request_v2::ChunkFetchingRequest::PROTOCOL);

    Self {
        recovery_strategy_kind,
        bypass_availability_store: false,
        post_recovery_check: PostRecoveryCheck::Reencode,
        req_receiver,
        metrics,
        req_v1_protocol_name,
        req_v2_protocol_name,
    }
}
/// Starts the inner subsystem loop.
///
/// The loop multiplexes four event sources: erasure tasks coming from recovery tasks, overseer
/// signals and messages, incoming available data requests from the network, and finished
/// recoveries. It returns `Ok(())` once the overseer tells the subsystem to conclude and an
/// error as soon as a fatal error occurs; non-fatal errors are only logged.
pub async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
let mut state = State::default();
let Self {
mut req_receiver,
metrics,
recovery_strategy_kind,
bypass_availability_store,
post_recovery_check,
req_v1_protocol_name,
req_v2_protocol_name,
} = self;
let (erasure_task_tx, erasure_task_rx) = futures::channel::mpsc::channel(16);
let mut erasure_task_rx = erasure_task_rx.fuse();
// `ThreadPoolBuilder` spawns the tasks using `spawn_blocking`. For each worker there will
// be a `mpsc` channel created. Each of these workers take the `Receiver` and poll it in an
// infinite loop. All of the sender ends of the channel are sent as a vec which we then use
// to create a `Cycle` iterator. We use this iterator to assign work in a round-robin
// fashion to the workers in the pool.
//
// How work is dispatched to the pool from the recovery tasks:
// - Once a recovery task finishes retrieving the availability data, it needs to reconstruct
// from chunks and/or
// re-encode the data which are heavy CPU computations.
// To do so it sends an `ErasureTask` to the main loop via the `erasure_task` channel, and
// waits for the results over a `oneshot` channel.
// - In the subsystem main loop we poll the `erasure_task_rx` receiver.
// - We forward the received `ErasureTask` to the `next()` sender yielded by the `Cycle`
// iterator.
// - Some worker thread handles it and sends the response over the `oneshot` channel.
// Create a thread pool with 2 workers.
let mut to_pool = ThreadPoolBuilder::build(
// Pool is guaranteed to have at least 1 worker thread.
NonZeroUsize::new(2).expect("There are 2 threads; qed"),
metrics.clone(),
&mut ctx,
)
.into_iter()
.cycle();
loop {
// Requests that cannot be decoded cost the sending peer `COST_INVALID_REQUEST`.
let recv_req = req_receiver.recv(|| vec![COST_INVALID_REQUEST]).fuse();
pin_mut!(recv_req);
let res = futures::select! {
// An erasure task from a recovery task: hand it to the next worker in the cycle.
erasure_task = erasure_task_rx.next() => {
match erasure_task {
Some(task) => {
to_pool
.next()
.expect("Pool size is `NonZeroUsize`; qed")
.send(task)
.await
.map_err(|_| RecoveryError::ChannelClosed)
},
None => {
Err(RecoveryError::ChannelClosed)
}
}.map_err(Into::into)
}
// A signal or message from the overseer.
signal = ctx.recv().fuse() => {
match signal {
Ok(signal) => {
match signal {
FromOrchestra::Signal(signal) => if handle_signal(
&mut state,
signal,
).await {
gum::debug!(target: LOG_TARGET, "subsystem concluded");
return Ok(());
} else {
Ok(())
},
FromOrchestra::Communication {
msg: AvailabilityRecoveryMessage::RecoverAvailableData(
receipt,
session_index,
maybe_backing_group,
maybe_core_index,
response_sender,
)
} => handle_recover(
&mut state,
&mut ctx,
receipt,
session_index,
maybe_backing_group,
response_sender,
&metrics,
erasure_task_tx.clone(),
recovery_strategy_kind.clone(),
bypass_availability_store,
post_recovery_check.clone(),
maybe_core_index,
req_v1_protocol_name.clone(),
req_v2_protocol_name.clone(),
).await
}
},
Err(e) => Err(Error::SubsystemReceive(e))
}
}
// An incoming request for available data from the network.
in_req = recv_req => {
match in_req {
Ok(req) => {
if bypass_availability_store {
gum::debug!(
target: LOG_TARGET,
"Skipping request to availability-store.",
);
// Without an availability store there is nothing to serve.
let _ = req.send_response(None.into());
Ok(())
} else {
match query_full_data(&mut ctx, req.payload.candidate_hash).await {
Ok(res) => {
let _ = req.send_response(res.into());
Ok(())
}
Err(e) => {
// Still answer the request before reporting the error.
let _ = req.send_response(None.into());
Err(e)
}
}
}
}
Err(e) => Err(Error::IncomingRequest(e))
}
}
// A recovery finished, or all of its waiters went away (`None`).
output = state.ongoing_recoveries.select_next_some() => {
let mut res = Ok(());
if let Some((candidate_hash, result)) = output {
if let Err(ref e) = result {
res = Err(Error::Recovery(e.clone()));
}
// Cache the result, if it is one of the cacheable kinds.
if let Ok(recovery) = CachedRecovery::try_from(result) {
state.availability_lru.insert(candidate_hash, recovery);
}
}
res
}
};
// Only bubble up fatal errors, but log all of them.
if let Err(e) = res {
log_error(Err(e))?;
}
}
}
}
// A simple thread pool implementation using `spawn_blocking` threads.
//
// The type itself carries no state: `build` spawns the workers and hands back the sending
// halves of their task channels, so the caller owns all routing of work.
struct ThreadPoolBuilder;
// Upper bound on the number of workers `ThreadPoolBuilder::build` will create; larger
// requested sizes are clamped down to this value.
//
// Constructed through a `match` so that a zero value fails constant evaluation instead of
// surfacing at runtime.
const MAX_THREADS: NonZeroUsize = match NonZeroUsize::new(4) {
Some(max_threads) => max_threads,
None => panic!("MAX_THREADS must be non-zero"),
};
impl ThreadPoolBuilder {
    // Builds a pool of `size` workers, with `size` clamped so that 1 <= `size` <= `MAX_THREADS`.
    //
    // Every worker is started through `spawn_blocking` and owns the receiving half of its own
    // bounded channel; the matching sending halves are handed back to the caller, one per
    // worker and in spawn order. A worker runs `erasure_task_thread`, which polls its
    // `Receiver` for `ErasureTask`s. These are expected to be CPU intensive: the larger the
    // input (more or larger chunks/availability data), the more CPU cycles are spent.
    //
    // For example, for 32KB PoVs, we'd expect re-encode to eat as much as 90ms and 500ms for
    // 2.5MiB.
    //
    // Once a task has been executed, the worker answers through the `oneshot` sender that was
    // shipped with the task.
    //
    // A failure to spawn a worker is only logged; its sender is still part of the returned
    // vector. Routing work to the workers is entirely up to the caller.
    #[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
    pub fn build<Context>(
        size: NonZeroUsize,
        metrics: Metrics,
        ctx: &mut Context,
    ) -> Vec<futures::channel::mpsc::Sender<ErasureTask>> {
        // At least 1 worker, at most `MAX_THREADS`.
        let worker_count = size.min(MAX_THREADS).get();

        (0..worker_count)
            .map(|index| {
                // One bounded queue per worker.
                let (task_tx, task_rx) = futures::channel::mpsc::channel(8);
                let spawned = ctx.spawn_blocking(
                    "erasure-task",
                    Box::pin(erasure_task_thread(metrics.clone(), task_rx)),
                );
                if let Err(e) = spawned {
                    gum::warn!(
                        target: LOG_TARGET,
                        err = ?e,
                        index,
                        "Failed to spawn a erasure task",
                    );
                }
                task_tx
            })
            .collect()
    }
}
// Handles CPU intensive operations on a dedicated blocking thread.
//
// Pulls `ErasureTask`s from `ingress` until the channel is closed and answers each one through
// the `oneshot` sender carried by the task:
// - `Reconstruct`: runs `reconstruct_v1` over the provided chunks and sends back its result.
// - `Reencode`: checks the data against the erasure root via `reconstructed_data_matches_root`
//   and sends back `Some(data)` on a match, `None` otherwise.
//
// A failed send (the requester dropped its receiver) is deliberately ignored: there is nobody
// left to inform.
async fn erasure_task_thread(
    metrics: Metrics,
    mut ingress: futures::channel::mpsc::Receiver<ErasureTask>,
) {
    while let Some(task) = ingress.next().await {
        match task {
            ErasureTask::Reconstruct(n_validators, chunks, sender) => {
                let _ = sender.send(pezkuwi_erasure_coding::reconstruct_v1(
                    n_validators,
                    chunks.iter().map(|(c_index, chunk)| {
                        (
                            &chunk[..],
                            usize::try_from(c_index.0)
                                .expect("usize is at least u32 bytes on all modern targets."),
                        )
                    }),
                ));
            },
            ErasureTask::Reencode(n_validators, root, available_data, sender) => {
                // `metrics` is only borrowed here, so no per-task clone is needed.
                let maybe_data = if reconstructed_data_matches_root(
                    n_validators,
                    &root,
                    &available_data,
                    &metrics,
                ) {
                    Some(available_data)
                } else {
                    None
                };
                let _ = sender.send(maybe_data);
            },
        }

        // In benchmarks this is a very hot loop not yielding at all.
        // To update CPU metrics for the task we need to yield.
        #[cfg(feature = "subsystem-benchmarks")]
        tokio::task::yield_now().await;
    }

    // All senders are gone, so no further work can arrive.
    gum::trace!(
        target: LOG_TARGET,
        "Erasure task channel closed. Node shutting down ?",
    );
}
@@ -0,0 +1,409 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem::prometheus::HistogramVec;
use pezkuwi_node_subsystem_util::metrics::{
self,
prometheus::{
self, prometheus::HistogramTimer, Counter, CounterVec, Histogram, Opts, PrometheusError,
Registry, U64,
},
};
/// Availability Recovery metrics.
///
/// Wraps an optional `MetricsInner`. When it is `None` (the `Default` value, also returned by
/// `new_dummy`), every reporting method is a no-op and every timer getter returns `None`.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
/// The actual Prometheus collectors backing `Metrics`.
#[derive(Clone)]
struct MetricsInner {
/// Number of sent chunk requests.
///
/// Gets incremented on each sent chunk request.
///
/// Split by chunk type:
/// - `regular_chunks`
/// - `systematic_chunks`
chunk_requests_issued: CounterVec<U64>,
/// Total number of bytes recovered.
///
/// Gets incremented by the size of the recovered data on each successful recovery.
recovered_bytes_total: Counter<U64>,
/// A counter for finished chunk requests.
///
/// Split by the chunk type (`regular_chunks` or `systematic_chunks`).
///
/// Also split by result:
/// - `no_such_chunk` ... peer did not have the requested chunk
/// - `timeout` ... request timed out.
/// - `error` ... Some networking issue except timeout
/// - `invalid` ... Chunk was received, but not valid.
/// - `success`
chunk_requests_finished: CounterVec<U64>,
/// A counter for successful chunk requests, split by the network protocol version
/// (`v1` or `v2`).
chunk_request_protocols: CounterVec<U64>,
/// Number of sent available data requests.
full_data_requests_issued: Counter<U64>,
/// Counter for finished available data requests.
///
/// Split by the result type:
///
/// - `no_such_data` ... peer did not have the requested data
/// - `timeout` ... request timed out.
/// - `error` ... Some networking issue except timeout
/// - `invalid` ... data was received, but not valid.
/// - `success`
full_data_requests_finished: CounterVec<U64>,
/// The duration from sending a chunk request until its response arrives.
///
/// Split by chunk type (`regular_chunks` or `systematic_chunks`).
time_chunk_request: HistogramVec,
/// The time spent recovering the data from the erasure code and verifying it.
///
/// Split by recovery type (`regular_chunks`, `systematic_chunks` or `full_from_backers`).
time_erasure_recovery: HistogramVec,
/// How much time it takes to reconstruct the available data from chunks.
///
/// Split by chunk type (`regular_chunks` or `systematic_chunks`), as the algorithms are
/// different.
time_erasure_reconstruct: HistogramVec,
/// How much time it takes to re-encode the data into erasure chunks in order to verify
/// the root hash of the provided Merkle tree. See `reconstructed_data_matches_root`.
time_reencode_chunks: Histogram,
/// Time of a full recovery, including erasure decoding or until we gave
/// up.
time_full_recovery: Histogram,
/// Number of full recoveries that have been finished one way or the other.
///
/// Split by recovery `strategy_type` (`full_from_backers, systematic_chunks, regular_chunks,
/// all`). `all` is used for failed recoveries that tried all available strategies.
/// Also split by `result` type (`success`, `failure` or `invalid`).
full_recoveries_finished: CounterVec<U64>,
/// Number of full recoveries that have been started on this subsystem.
///
/// Note: Those are only recoveries which could not get served locally already - so in other
/// words: Only real recoveries.
full_recoveries_started: Counter<U64>,
}
impl Metrics {
    /// Create new dummy metrics, not reporting anything.
    pub fn new_dummy() -> Self {
        Metrics(None)
    }

    /// Record a finished chunk request of the given `chunk_type` with the given `result`.
    ///
    /// `chunk_requests_finished` is registered with the label names `["result", "type"]` and
    /// `with_label_values` assigns the values positionally, so the result has to be passed
    /// first. Keeping this in a single place prevents the two labels from being swapped.
    fn chunk_request_finished(&self, chunk_type: &str, result: &str) {
        if let Some(metrics) = &self.0 {
            metrics.chunk_requests_finished.with_label_values(&[result, chunk_type]).inc();
        }
    }

    /// Record a finished full data request with the given `result`.
    fn full_request_finished(&self, result: &str) {
        if let Some(metrics) = &self.0 {
            metrics.full_data_requests_finished.with_label_values(&[result]).inc();
        }
    }

    /// Increment counter for chunk requests.
    pub fn on_chunk_request_issued(&self, chunk_type: &str) {
        if let Some(metrics) = &self.0 {
            metrics.chunk_requests_issued.with_label_values(&[chunk_type]).inc();
        }
    }

    /// Increment counter for full data requests.
    pub fn on_full_request_issued(&self) {
        if let Some(metrics) = &self.0 {
            metrics.full_data_requests_issued.inc();
        }
    }

    /// A chunk request timed out.
    pub fn on_chunk_request_timeout(&self, chunk_type: &str) {
        self.chunk_request_finished(chunk_type, "timeout");
    }

    /// A full data request timed out.
    pub fn on_full_request_timeout(&self) {
        self.full_request_finished("timeout");
    }

    /// A chunk request failed because validator did not have its chunk.
    pub fn on_chunk_request_no_such_chunk(&self, chunk_type: &str) {
        self.chunk_request_finished(chunk_type, "no_such_chunk");
    }

    /// A full data request failed because the validator did not have it.
    pub fn on_full_request_no_such_data(&self) {
        self.full_request_finished("no_such_data");
    }

    /// A chunk request failed for some non timeout related network error.
    pub fn on_chunk_request_error(&self, chunk_type: &str) {
        self.chunk_request_finished(chunk_type, "error");
    }

    /// A full data request failed for some non timeout related network error.
    pub fn on_full_request_error(&self) {
        self.full_request_finished("error");
    }

    /// A chunk request succeeded, but was not valid.
    pub fn on_chunk_request_invalid(&self, chunk_type: &str) {
        self.chunk_request_finished(chunk_type, "invalid");
    }

    /// A full data request succeeded, but was not valid.
    pub fn on_full_request_invalid(&self) {
        self.full_request_finished("invalid");
    }

    /// A chunk request succeeded.
    pub fn on_chunk_request_succeeded(&self, chunk_type: &str) {
        self.chunk_request_finished(chunk_type, "success");
    }

    /// A chunk response was received on the v1 protocol.
    pub fn on_chunk_response_v1(&self) {
        if let Some(metrics) = &self.0 {
            metrics.chunk_request_protocols.with_label_values(&["v1"]).inc();
        }
    }

    /// A chunk response was received on the v2 protocol.
    pub fn on_chunk_response_v2(&self) {
        if let Some(metrics) = &self.0 {
            metrics.chunk_request_protocols.with_label_values(&["v2"]).inc();
        }
    }

    /// A full data request succeeded.
    pub fn on_full_request_succeeded(&self) {
        self.full_request_finished("success");
    }

    /// Get a timer to time request/response duration.
    ///
    /// Returns `None` for dummy metrics.
    pub fn time_chunk_request(&self, chunk_type: &str) -> Option<HistogramTimer> {
        self.0.as_ref().map(|metrics| {
            metrics.time_chunk_request.with_label_values(&[chunk_type]).start_timer()
        })
    }

    /// Get a timer to time erasure code recover.
    ///
    /// Returns `None` for dummy metrics.
    pub fn time_erasure_recovery(&self, chunk_type: &str) -> Option<HistogramTimer> {
        self.0.as_ref().map(|metrics| {
            metrics.time_erasure_recovery.with_label_values(&[chunk_type]).start_timer()
        })
    }

    /// Get a timer for available data reconstruction.
    ///
    /// Returns `None` for dummy metrics.
    pub fn time_erasure_reconstruct(&self, chunk_type: &str) -> Option<HistogramTimer> {
        self.0.as_ref().map(|metrics| {
            metrics.time_erasure_reconstruct.with_label_values(&[chunk_type]).start_timer()
        })
    }

    /// Get a timer to time chunk encoding.
    ///
    /// Returns `None` for dummy metrics.
    pub fn time_reencode_chunks(&self) -> Option<HistogramTimer> {
        self.0.as_ref().map(|metrics| metrics.time_reencode_chunks.start_timer())
    }

    /// Get a timer to measure the time of the complete recovery process.
    ///
    /// Returns `None` for dummy metrics.
    pub fn time_full_recovery(&self) -> Option<HistogramTimer> {
        self.0.as_ref().map(|metrics| metrics.time_full_recovery.start_timer())
    }

    /// A full recovery succeeded.
    ///
    /// Also adds `bytes` to the total number of recovered bytes.
    pub fn on_recovery_succeeded(&self, strategy_type: &str, bytes: usize) {
        if let Some(metrics) = &self.0 {
            metrics
                .full_recoveries_finished
                .with_label_values(&["success", strategy_type])
                .inc();
            metrics.recovered_bytes_total.inc_by(bytes as u64);
        }
    }

    /// A full recovery failed (data not available).
    pub fn on_recovery_failed(&self, strategy_type: &str) {
        if let Some(metrics) = &self.0 {
            metrics
                .full_recoveries_finished
                .with_label_values(&["failure", strategy_type])
                .inc();
        }
    }

    /// A full recovery failed (data was recovered, but invalid).
    pub fn on_recovery_invalid(&self, strategy_type: &str) {
        if let Some(metrics) = &self.0 {
            metrics
                .full_recoveries_finished
                .with_label_values(&["invalid", strategy_type])
                .inc();
        }
    }

    /// A recover was started.
    pub fn on_recovery_started(&self) {
        if let Some(metrics) = &self.0 {
            metrics.full_recoveries_started.inc();
        }
    }
}
impl metrics::Metrics for Metrics {
/// Creates every collector of `MetricsInner` and registers it with `registry`.
///
/// Returns the first `PrometheusError` hit while constructing or registering a collector;
/// otherwise returns metrics that actually report.
///
/// Note for the labelled collectors: values handed to `with_label_values` are matched to the
/// label names declared here by position, so callers must pass them in the declared order.
fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
let metrics = MetricsInner {
chunk_requests_issued: prometheus::register(
CounterVec::new(
Opts::new("pezkuwi_teyrchain_availability_recovery_chunk_requests_issued",
"Total number of issued chunk requests."),
&["type"]
)?,
registry,
)?,
full_data_requests_issued: prometheus::register(
Counter::new(
"pezkuwi_teyrchain_availability_recovery_full_data_requests_issued",
"Total number of issued full data requests.",
)?,
registry,
)?,
recovered_bytes_total: prometheus::register(
Counter::new(
"pezkuwi_teyrchain_availability_recovery_bytes_total",
"Total number of bytes recovered",
)?,
registry,
)?,
// Two labels: the declared order is `result` first, then `type`.
chunk_requests_finished: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_availability_recovery_chunk_requests_finished",
"Total number of chunk requests finished.",
),
&["result", "type"],
)?,
registry,
)?,
chunk_request_protocols: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_availability_recovery_chunk_request_protocols",
"Total number of successful chunk requests, mapped by the protocol version (v1 or v2).",
),
&["protocol"],
)?,
registry,
)?,
full_data_requests_finished: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_availability_recovery_full_data_requests_finished",
"Total number of full data requests finished.",
),
&["result"],
)?,
registry,
)?,
time_chunk_request: prometheus::register(
prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
"pezkuwi_teyrchain_availability_recovery_time_chunk_request",
"Time spent waiting for a response to a chunk request",
), &["type"])?,
registry,
)?,
time_erasure_recovery: prometheus::register(
prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
"pezkuwi_teyrchain_availability_recovery_time_erasure_recovery",
"Time spent to recover the erasure code and verify the merkle root by re-encoding as erasure chunks",
), &["type"])?,
registry,
)?,
time_erasure_reconstruct: prometheus::register(
prometheus::HistogramVec::new(prometheus::HistogramOpts::new(
"pezkuwi_teyrchain_availability_recovery_time_erasure_reconstruct",
"Time spent to reconstruct the data from chunks",
), &["type"])?,
registry,
)?,
// NOTE(review): unlike its siblings this metric name has no `_recovery_` infix;
// presumably historical - confirm before renaming, as dashboards may depend on it.
time_reencode_chunks: prometheus::register(
prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
"pezkuwi_teyrchain_availability_reencode_chunks",
"Time spent re-encoding the data as erasure chunks",
))?,
registry,
)?,
time_full_recovery: prometheus::register(
prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
"pezkuwi_teyrchain_availability_recovery_time_total",
"Time a full recovery process took, either until failure or successful erasure decoding.",
))?,
registry,
)?,
// Two labels: the declared order is `result` first, then `strategy_type`.
full_recoveries_finished: prometheus::register(
CounterVec::new(
Opts::new(
"pezkuwi_teyrchain_availability_recovery_recoveries_finished",
"Total number of recoveries that finished.",
),
&["result", "strategy_type"],
)?,
registry,
)?,
full_recoveries_started: prometheus::register(
Counter::new(
"pezkuwi_teyrchain_availability_recovery_recoveries_started",
"Total number of started recoveries.",
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
}
@@ -0,0 +1,197 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Main recovery task logic. Runs recovery strategies.
#![warn(missing_docs)]
mod strategy;
pub use self::strategy::{
FetchChunks, FetchChunksParams, FetchFull, FetchFullParams, FetchSystematicChunks,
FetchSystematicChunksParams, RecoveryStrategy, State,
};
#[cfg(test)]
pub use self::strategy::{REGULAR_CHUNKS_REQ_RETRY_LIMIT, SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT};
use crate::{metrics::Metrics, ErasureTask, PostRecoveryCheck, LOG_TARGET};
use codec::Encode;
use pezkuwi_node_primitives::AvailableData;
use pezkuwi_node_subsystem::{messages::AvailabilityStoreMessage, overseer, RecoveryError};
use pezkuwi_primitives::{AuthorityDiscoveryId, CandidateHash, Hash};
use sc_network::ProtocolName;
use futures::channel::{mpsc, oneshot};
use std::collections::VecDeque;
/// Recovery parameters common to all strategies in a `RecoveryTask`.
///
/// A shared reference to these parameters is handed to every strategy's `run`.
#[derive(Clone)]
pub struct RecoveryParams {
/// Discovery ids of `validators`.
///
/// Strategies index this vector by validator index, so it must have an entry for every
/// validator index they may request from.
pub validator_authority_keys: Vec<AuthorityDiscoveryId>,
/// Number of validators.
pub n_validators: usize,
/// The number of regular chunks needed.
///
/// Reconstruction from regular chunks is attempted once at least this many chunks are held.
pub threshold: usize,
/// The number of systematic chunks needed.
pub systematic_threshold: usize,
/// A hash of the relevant candidate.
pub candidate_hash: CandidateHash,
/// The root of the erasure encoding of the candidate.
pub erasure_root: Hash,
/// Metrics to report.
pub metrics: Metrics,
/// Do not request data from availability-store. Useful for collators.
///
/// When set, neither the full available data nor locally stored chunks are queried.
pub bypass_availability_store: bool,
/// The type of check to perform after available data was recovered.
pub post_recovery_check: PostRecoveryCheck,
/// The blake2-256 hash of the PoV.
pub pov_hash: Hash,
/// Protocol name for ChunkFetchingV1.
pub req_v1_protocol_name: ProtocolName,
/// Protocol name for ChunkFetchingV2.
pub req_v2_protocol_name: ProtocolName,
/// Whether or not chunk mapping is enabled.
pub chunk_mapping_enabled: bool,
/// Channel to the erasure task handler.
///
/// Strategies clone this sender to submit `ErasureTask`s; the answer comes back through
/// the `oneshot` sender embedded in the task.
pub erasure_task_tx: mpsc::Sender<ErasureTask>,
}
/// A stateful reconstruction of availability data in reference to
/// a candidate hash.
pub struct RecoveryTask<Sender: overseer::AvailabilityRecoverySenderTrait> {
// Used to message other subsystems (e.g. to query the availability store) and passed on
// to each strategy.
sender: Sender,
// Parameters shared by all strategies of this task.
params: RecoveryParams,
// Strategies still to be tried; consumed from the front, in order.
strategies: VecDeque<Box<dyn RecoveryStrategy<Sender>>>,
// Recovery state handed to every strategy in turn, so it carries over from one strategy
// to the next.
state: State,
}
impl<Sender> RecoveryTask<Sender>
where
    Sender: overseer::AvailabilityRecoverySenderTrait,
{
    /// Instantiate a new recovery task.
    ///
    /// The task starts out with a fresh `State`; `strategies` are tried front to back.
    pub fn new(
        sender: Sender,
        params: RecoveryParams,
        strategies: VecDeque<Box<dyn RecoveryStrategy<Sender>>>,
    ) -> Self {
        let state = State::new();
        Self { sender, params, strategies, state }
    }

    /// Ask the availability store for the full data of the candidate.
    ///
    /// Yields `None` when the store is bypassed, does not hold the data, or dropped the
    /// response channel (the latter is logged).
    async fn in_availability_store(&mut self) -> Option<AvailableData> {
        if self.params.bypass_availability_store {
            return None;
        }

        let (response_tx, response_rx) = oneshot::channel();
        self.sender
            .send_message(AvailabilityStoreMessage::QueryAvailableData(
                self.params.candidate_hash,
                response_tx,
            ))
            .await;

        match response_rx.await {
            Ok(maybe_data) => maybe_data,
            Err(oneshot::Canceled) => {
                gum::warn!(
                    target: LOG_TARGET,
                    candidate_hash = ?self.params.candidate_hash,
                    "Failed to reach the availability store",
                );
                None
            },
        }
    }

    /// Run this recovery task to completion.
    ///
    /// Data found in the local availability store is returned right away. Otherwise the
    /// configured strategies are run in order: the first one that recovers the full
    /// `AvailableData` ends the task, a strategy reporting `RecoveryError::Unavailable` makes
    /// the task move on to the next one, and any other error is returned immediately.
    /// `RecoveryError::Unavailable` is returned once all strategies are exhausted.
    pub async fn run(mut self) -> Result<AvailableData, RecoveryError> {
        if let Some(local_data) = self.in_availability_store().await {
            return Ok(local_data);
        }

        self.params.metrics.on_recovery_started();

        // Observes the duration of the whole recovery when dropped.
        let _timer = self.params.metrics.time_full_recovery();

        while let Some(current_strategy) = self.strategies.pop_front() {
            let display_name = current_strategy.display_name();
            let strategy_type = current_strategy.strategy_type();

            gum::debug!(
                target: LOG_TARGET,
                candidate_hash = ?self.params.candidate_hash,
                "Starting `{}` strategy",
                display_name
            );

            let outcome =
                current_strategy.run(&mut self.state, &mut self.sender, &self.params).await;

            let err = match outcome {
                Ok(data) => {
                    self.params.metrics.on_recovery_succeeded(strategy_type, data.encoded_size());
                    return Ok(data);
                },
                Err(err) => err,
            };

            if matches!(err, RecoveryError::Unavailable) {
                // Without a strategy left, the loop simply ends on the next `pop_front`.
                if !self.strategies.is_empty() {
                    gum::debug!(
                        target: LOG_TARGET,
                        candidate_hash = ?self.params.candidate_hash,
                        "Recovery strategy `{}` did not conclude. Trying the next one.",
                        display_name
                    );
                }
                continue;
            }

            if matches!(err, RecoveryError::Invalid) {
                self.params.metrics.on_recovery_invalid(strategy_type);
            } else {
                self.params.metrics.on_recovery_failed(strategy_type);
            }
            return Err(err);
        }

        // We have no other strategies to try.
        gum::warn!(
            target: LOG_TARGET,
            candidate_hash = ?self.params.candidate_hash,
            "Recovery of available data failed.",
        );
        self.params.metrics.on_recovery_failed("all");

        Err(RecoveryError::Unavailable)
    }
}
@@ -0,0 +1,334 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::{
futures_undead::FuturesUndead,
task::{
strategy::{
do_post_recovery_check, is_unavailable, OngoingRequests, N_PARALLEL,
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
},
RecoveryParams, State,
},
ErasureTask, RecoveryStrategy, LOG_TARGET,
};
use pezkuwi_node_primitives::AvailableData;
use pezkuwi_node_subsystem::{overseer, RecoveryError};
use pezkuwi_primitives::ValidatorIndex;
use futures::{channel::oneshot, SinkExt};
use rand::seq::SliceRandom;
use std::collections::VecDeque;
/// Parameters specific to the `FetchChunks` strategy.
pub struct FetchChunksParams {
/// Number of validators; the strategy considers the validator indices `0..n_validators`.
pub n_validators: usize,
}
/// `RecoveryStrategy` that requests chunks from validators, in parallel.
pub struct FetchChunks {
/// How many requests have been unsuccessful so far.
///
/// Together with `total_received_responses` this yields the observed error rate, which is
/// used to over-request chunks.
error_count: usize,
/// Total number of responses that have been received, including failed ones.
total_received_responses: usize,
/// A shuffled array of validator indices.
///
/// Filtered and then drained into the request queue at the start of `run`.
validators: VecDeque<ValidatorIndex>,
/// Collection of in-flight requests.
requesting_chunks: OngoingRequests,
}
impl FetchChunks {
    /// Instantiate a new strategy.
    ///
    /// The validator indices `0..n_validators` are put in a random order, so that the same
    /// validators are not asked for their chunks over and over.
    pub fn new(params: FetchChunksParams) -> Self {
        let mut indices: Vec<ValidatorIndex> =
            (0..params.n_validators).map(|i| ValidatorIndex(i as u32)).collect();
        indices.shuffle(&mut rand::thread_rng());

        Self {
            error_count: 0,
            total_received_responses: 0,
            validators: VecDeque::from(indices),
            requesting_chunks: FuturesUndead::new(),
        }
    }

    /// Adapter around the shared `is_unavailable` helper, which takes the same values in a
    /// different argument order.
    fn is_unavailable(
        unrequested_validators: usize,
        in_flight_requests: usize,
        chunk_count: usize,
        threshold: usize,
    ) -> bool {
        is_unavailable(chunk_count, in_flight_requests, unrequested_validators, threshold)
    }

    /// Desired number of parallel requests.
    ///
    /// For the given threshold (total required number of chunks) and the number of chunks
    /// already held, compute how many requests should be in flight in parallel right now.
    fn get_desired_request_count(&self, chunk_count: usize, threshold: usize) -> usize {
        // How many chunks are still needed?
        let still_needed = threshold.saturating_sub(chunk_count);

        // Inverse of the current error rate: responses received per failed request. Zero when
        // there were no errors yet, in which case nothing is added on top below.
        let responses_per_error =
            self.total_received_responses.checked_div(self.error_count).unwrap_or(0);
        let compensation = still_needed.checked_div(responses_per_error).unwrap_or(0);

        // Upper bound for parallel requests.
        // This is limited on purpose, so requests can be processed within the timeout and the
        // following feedback loop is kept in check:
        // 1. Requests fail due to timeout
        // 2. We request more chunks to make up for it
        // 3. Bandwidth is spread out even more, so we get even more timeouts
        // 4. We request more chunks to make up for it ...
        let upper_bound = threshold.min(N_PARALLEL);

        // Actual number of requests we want to have in flight in parallel:
        upper_bound.min(still_needed + compensation)
    }

    /// Reconstruct the available data from the chunks gathered in `state` and run the
    /// post-recovery check on it.
    ///
    /// The reconstruction itself is delegated to the erasure task handler. A failed
    /// reconstruction maps to `RecoveryError::Invalid`; a closed channel maps to
    /// `RecoveryError::ChannelClosed`. On every failure after the task was submitted, the
    /// recovery timer is discarded instead of being observed.
    async fn attempt_recovery<Sender: overseer::AvailabilityRecoverySenderTrait>(
        &mut self,
        state: &mut State,
        common_params: &RecoveryParams,
    ) -> Result<AvailableData, RecoveryError> {
        let strategy_type = RecoveryStrategy::<Sender>::strategy_type(self);
        let recovery_duration = common_params.metrics.time_erasure_recovery(strategy_type);

        // Send request to reconstruct available data from chunks.
        let (available_data_tx, available_data_rx) = oneshot::channel();
        let mut erasure_task_tx = common_params.erasure_task_tx.clone();

        // Safe to leave an empty collection in place, as we're stopping the recovery process
        // if this reconstruct fails.
        let reconstruct_task = ErasureTask::Reconstruct(
            common_params.n_validators,
            std::mem::take(&mut state.received_chunks)
                .into_iter()
                .map(|(c_index, chunk)| (c_index, chunk.chunk))
                .collect(),
            available_data_tx,
        );

        erasure_task_tx
            .send(reconstruct_task)
            .await
            .map_err(|_| RecoveryError::ChannelClosed)?;

        let available_data_response =
            available_data_rx.await.map_err(|_| RecoveryError::ChannelClosed)?;

        let data = match available_data_response {
            Ok(data) => data,
            Err(err) => {
                if let Some(timer) = recovery_duration {
                    timer.stop_and_discard();
                }
                gum::debug!(
                    target: LOG_TARGET,
                    candidate_hash = ?common_params.candidate_hash,
                    erasure_root = ?common_params.erasure_root,
                    ?err,
                    "Data recovery error",
                );
                return Err(RecoveryError::Invalid);
            },
        };

        // Attempt post-recovery check.
        match do_post_recovery_check(common_params, data).await {
            Ok(checked_data) => {
                // Observe the recovery duration now that the data has been verified.
                drop(recovery_duration);
                gum::trace!(
                    target: LOG_TARGET,
                    candidate_hash = ?common_params.candidate_hash,
                    erasure_root = ?common_params.erasure_root,
                    "Data recovery from chunks complete",
                );
                Ok(checked_data)
            },
            Err(check_err) => {
                if let Some(timer) = recovery_duration {
                    timer.stop_and_discard();
                }
                Err(check_err)
            },
        }
    }
}
#[async_trait::async_trait]
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender> for FetchChunks {
/// Human readable name of the strategy, used in log messages.
fn display_name(&self) -> &'static str {
"Fetch chunks"
}
/// Label value identifying this strategy in metrics.
fn strategy_type(&self) -> &'static str {
"regular_chunks"
}
/// Request chunks from validators until at least `threshold` chunks are held, then
/// reconstruct the data.
///
/// Returns `RecoveryError::Unavailable` once the threshold can no longer be reached with the
/// chunks held, the requests in flight and the validators not yet asked. Once enough chunks
/// are held, the result of `attempt_recovery` is returned as is.
async fn run(
mut self: Box<Self>,
state: &mut State,
sender: &mut Sender,
common_params: &RecoveryParams,
) -> Result<AvailableData, RecoveryError> {
// First query the store for any chunks we've got.
if !common_params.bypass_availability_store {
let local_chunk_indices = state.populate_from_av_store(common_params, sender).await;
// Chunks found locally do not need to be requested from their validators.
self.validators.retain(|validator_index| {
!local_chunk_indices.iter().any(|(v_index, _)| v_index == validator_index)
});
}
// No need to query the validators that have the chunks we already received or that we know
// don't have the data from previous strategies.
self.validators.retain(|v_index| {
!state.received_chunks.values().any(|c| v_index == &c.validator_index) &&
state.can_retry_request(
&(common_params.validator_authority_keys[v_index.0 as usize].clone(), *v_index),
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
)
});
// Safe to `take` here, as we're consuming `self` anyway and we're not using the
// `validators` field in other methods.
// Each remaining validator index is paired with its authority discovery id.
let mut validators_queue: VecDeque<_> = std::mem::take(&mut self.validators)
.into_iter()
.map(|validator_index| {
(
common_params.validator_authority_keys[validator_index.0 as usize].clone(),
validator_index,
)
})
.collect();
loop {
// If received_chunks has at least `threshold` entries, attempt to recover the data.
// If that fails, or a re-encoding of it doesn't match the expected erasure root,
// return Err(RecoveryError::Invalid).
// Do this before requesting any chunks because we may have enough of them coming from
// past RecoveryStrategies.
if state.chunk_count() >= common_params.threshold {
return self.attempt_recovery::<Sender>(state, common_params).await;
}
// Give up as soon as the threshold cannot be reached anymore.
if Self::is_unavailable(
validators_queue.len(),
self.requesting_chunks.total_len(),
state.chunk_count(),
common_params.threshold,
) {
gum::debug!(
target: LOG_TARGET,
candidate_hash = ?common_params.candidate_hash,
erasure_root = ?common_params.erasure_root,
received = %state.chunk_count(),
requesting = %self.requesting_chunks.len(),
total_requesting = %self.requesting_chunks.total_len(),
n_validators = %common_params.n_validators,
"Data recovery from chunks is not possible",
);
return Err(RecoveryError::Unavailable);
}
let desired_requests_count =
self.get_desired_request_count(state.chunk_count(), common_params.threshold);
let already_requesting_count = self.requesting_chunks.len();
gum::debug!(
target: LOG_TARGET,
?common_params.candidate_hash,
?desired_requests_count,
error_count= ?self.error_count,
total_received = ?self.total_received_responses,
threshold = ?common_params.threshold,
?already_requesting_count,
"Requesting availability chunks for a candidate",
);
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);
state
.launch_parallel_chunk_requests(
strategy_type,
common_params,
sender,
desired_requests_count,
&mut validators_queue,
&mut self.requesting_chunks,
)
.await;
// The closure is given the current counts and mirrors the two exit conditions checked
// at the top of this loop: enough chunks, or the threshold being out of reach.
let (total_responses, error_count) = state
.wait_for_chunks(
strategy_type,
common_params,
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
&mut validators_queue,
&mut self.requesting_chunks,
&mut vec![],
|unrequested_validators,
in_flight_reqs,
chunk_count,
_systematic_chunk_count| {
chunk_count >= common_params.threshold ||
Self::is_unavailable(
unrequested_validators,
in_flight_reqs,
chunk_count,
common_params.threshold,
)
},
)
.await;
// Feed the observed error rate into the next `get_desired_request_count` call.
self.total_received_responses += total_responses;
self.error_count += error_count;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use pezkuwi_erasure_coding::recovery_threshold;
// Exercises `get_desired_request_count` for different error rates and checks both upper
// bounds (the threshold and `N_PARALLEL`).
#[test]
fn test_get_desired_request_count() {
let n_validators = 100;
let threshold = recovery_threshold(n_validators).unwrap();
let mut fetch_chunks_task = FetchChunks::new(FetchChunksParams { n_validators });
// No errors recorded yet: request exactly what is needed.
assert_eq!(fetch_chunks_task.get_desired_request_count(0, threshold), threshold);
fetch_chunks_task.error_count = 1;
fetch_chunks_task.total_received_responses = 1;
// We saturate at threshold (34):
assert_eq!(fetch_chunks_task.get_desired_request_count(0, threshold), threshold);
// We saturate at the parallel limit.
assert_eq!(fetch_chunks_task.get_desired_request_count(0, N_PARALLEL + 2), N_PARALLEL);
fetch_chunks_task.total_received_responses = 2;
// With given error rate - still saturating:
assert_eq!(fetch_chunks_task.get_desired_request_count(1, threshold), threshold);
fetch_chunks_task.total_received_responses = 10;
// error rate: 1/10
// remaining chunks needed: threshold (34) - 9 = 25
// expected: 25 + 25 / 10 (integer division) = 27
assert_eq!(fetch_chunks_task.get_desired_request_count(9, threshold), 27);
// We saturate at the parallel limit.
assert_eq!(fetch_chunks_task.get_desired_request_count(9, N_PARALLEL + 9), N_PARALLEL);
fetch_chunks_task.error_count = 0;
// With error count zero - we should fetch exactly as needed:
assert_eq!(fetch_chunks_task.get_desired_request_count(10, threshold), threshold - 10);
}
}
@@ -0,0 +1,174 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::{
task::{RecoveryParams, RecoveryStrategy, State},
ErasureTask, PostRecoveryCheck, LOG_TARGET,
};
use pezkuwi_node_network_protocol::request_response::{
self as req_res, outgoing::RequestError, OutgoingRequest, Recipient, Requests,
};
use pezkuwi_node_primitives::AvailableData;
use pezkuwi_node_subsystem::{messages::NetworkBridgeTxMessage, overseer, RecoveryError};
use pezkuwi_primitives::ValidatorIndex;
use sc_network::{IfDisconnected, OutboundFailure, RequestFailure};
use futures::{channel::oneshot, SinkExt};
use rand::seq::SliceRandom;
/// Parameters specific to the `FetchFull` strategy.
pub struct FetchFullParams {
    /// Validators that will be used for fetching the data.
    ///
    /// `FetchFull::new` shuffles this list; the strategy then pops validators off it one at a
    /// time until a request yields valid data or the list is empty.
    pub validators: Vec<ValidatorIndex>,
}
/// `RecoveryStrategy` that sequentially tries to fetch the full `AvailableData` from
/// already-connected validators in the configured validator set.
///
/// Requests are sent with `IfDisconnected::ImmediateError`, and only one request is in flight at
/// any time.
pub struct FetchFull {
    /// Strategy parameters; `params.validators` doubles as the queue of validators left to try.
    params: FetchFullParams,
}
impl FetchFull {
/// Create a new `FetchFull` recovery strategy.
pub fn new(mut params: FetchFullParams) -> Self {
params.validators.shuffle(&mut rand::thread_rng());
Self { params }
}
}
#[async_trait::async_trait]
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender> for FetchFull {
    /// Human-readable name of this strategy.
    fn display_name(&self) -> &'static str {
        "Full recovery from backers"
    }

    /// Short identifier of this strategy; used to label the recovery timer.
    fn strategy_type(&self) -> &'static str {
        "full_from_backers"
    }

    /// Ask the configured validators, one at a time, for the full `AvailableData`.
    ///
    /// Each loop iteration pops one validator, sends it an `AvailableDataFetchingV1` request and
    /// waits for the response. Data that passes the configured post-recovery check is returned
    /// immediately; any other outcome is counted in the metrics and the next validator is tried.
    ///
    /// # Errors
    ///
    /// * `RecoveryError::Unavailable` once there are no validators left to ask.
    /// * `RecoveryError::ChannelClosed` if the erasure task channel (or its response channel) is
    ///   closed while re-encoding the data.
    ///
    /// # Panics
    ///
    /// Indexes `common_params.validator_authority_keys` with the validator index, so an index
    /// outside that list panics.
    async fn run(
        mut self: Box<Self>,
        _: &mut State,
        sender: &mut Sender,
        common_params: &RecoveryParams,
    ) -> Result<AvailableData, RecoveryError> {
        let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);

        loop {
            // Pop the next validator. An empty list means every candidate source was tried.
            let validator_index =
                self.params.validators.pop().ok_or(RecoveryError::Unavailable)?;

            // Request data.
            let (req, response) = OutgoingRequest::new(
                Recipient::Authority(
                    common_params.validator_authority_keys[validator_index.0 as usize].clone(),
                ),
                req_res::v1::AvailableDataFetchingRequest {
                    candidate_hash: common_params.candidate_hash,
                },
            );

            sender
                .send_message(NetworkBridgeTxMessage::SendRequests(
                    vec![Requests::AvailableDataFetchingV1(req)],
                    IfDisconnected::ImmediateError,
                ))
                .await;

            common_params.metrics.on_full_request_issued();

            match response.await {
                Ok(req_res::v1::AvailableDataFetchingResponse::AvailableData(data)) => {
                    // The timer is observed when dropped and discarded explicitly when the data
                    // turns out to be invalid.
                    let recovery_duration =
                        common_params.metrics.time_erasure_recovery(strategy_type);

                    let maybe_data = match common_params.post_recovery_check {
                        PostRecoveryCheck::Reencode => {
                            // Hand the data to the erasure task and wait for its verdict.
                            let (reencode_tx, reencode_rx) = oneshot::channel();
                            let mut erasure_task_tx = common_params.erasure_task_tx.clone();

                            erasure_task_tx
                                .send(ErasureTask::Reencode(
                                    common_params.n_validators,
                                    common_params.erasure_root,
                                    data,
                                    reencode_tx,
                                ))
                                .await
                                .map_err(|_| RecoveryError::ChannelClosed)?;

                            reencode_rx.await.map_err(|_| RecoveryError::ChannelClosed)?
                        },
                        PostRecoveryCheck::PovHash =>
                            (data.pov.hash() == common_params.pov_hash).then_some(data),
                    };

                    match maybe_data {
                        Some(data) => {
                            gum::trace!(
                                target: LOG_TARGET,
                                candidate_hash = ?common_params.candidate_hash,
                                "Received full data",
                            );

                            common_params.metrics.on_full_request_succeeded();
                            return Ok(data);
                        },
                        None => {
                            common_params.metrics.on_full_request_invalid();
                            if let Some(timer) = recovery_duration {
                                timer.stop_and_discard();
                            }

                            gum::debug!(
                                target: LOG_TARGET,
                                candidate_hash = ?common_params.candidate_hash,
                                ?validator_index,
                                "Invalid data response",
                            );

                            // it doesn't help to report the peer with req/res.
                            // we'll try the next backer.
                        },
                    }
                },
                Ok(req_res::v1::AvailableDataFetchingResponse::NoSuchData) => {
                    common_params.metrics.on_full_request_no_such_data();
                },
                Err(e) => {
                    // Timeouts get their own counter; every other network failure and
                    // cancellation counts as a generic error.
                    match &e {
                        RequestError::InvalidResponse(_) =>
                            common_params.metrics.on_full_request_invalid(),
                        RequestError::NetworkError(RequestFailure::Network(
                            OutboundFailure::Timeout,
                        )) => common_params.metrics.on_full_request_timeout(),
                        RequestError::Canceled(_) | RequestError::NetworkError(_) =>
                            common_params.metrics.on_full_request_error(),
                    };

                    gum::debug!(
                        target: LOG_TARGET,
                        candidate_hash = ?common_params.candidate_hash,
                        ?validator_index,
                        err = ?e,
                        "Error fetching full available data."
                    );
                },
            }
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,341 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::{
futures_undead::FuturesUndead,
task::{
strategy::{
do_post_recovery_check, is_unavailable, OngoingRequests, N_PARALLEL,
SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
},
RecoveryParams, RecoveryStrategy, State,
},
LOG_TARGET,
};
use pezkuwi_node_primitives::AvailableData;
use pezkuwi_node_subsystem::{overseer, RecoveryError};
use pezkuwi_primitives::{ChunkIndex, ValidatorIndex};
use std::collections::VecDeque;
/// Parameters needed for fetching systematic chunks.
pub struct FetchSystematicChunksParams {
    /// Validators that hold the systematic chunks.
    ///
    /// The number of entries is used by `FetchSystematicChunks::new` as the systematic recovery
    /// threshold.
    pub validators: Vec<(ChunkIndex, ValidatorIndex)>,
    /// Validators in the backing group, to be used as a backup for requesting systematic chunks.
    pub backers: Vec<ValidatorIndex>,
}
/// `RecoveryStrategy` that attempts to recover the systematic chunks from the validators that
/// hold them, in order to bypass the erasure code reconstruction step, which is costly.
pub struct FetchSystematicChunks {
    /// Systematic recovery threshold.
    ///
    /// Set at construction to the number of systematic validators passed in.
    threshold: usize,
    /// Validators that hold the systematic chunks.
    ///
    /// Taken (left empty) by `run` when it builds its request queue.
    validators: Vec<(ChunkIndex, ValidatorIndex)>,
    /// Backers to be used as a backup.
    ///
    /// Taken (left empty) by `run` as well.
    backers: Vec<ValidatorIndex>,
    /// Collection of in-flight requests.
    requesting_chunks: OngoingRequests,
}
impl FetchSystematicChunks {
/// Instantiate a new systematic chunks strategy.
pub fn new(params: FetchSystematicChunksParams) -> Self {
Self {
threshold: params.validators.len(),
validators: params.validators,
backers: params.backers,
requesting_chunks: FuturesUndead::new(),
}
}
fn is_unavailable(
unrequested_validators: usize,
in_flight_requests: usize,
systematic_chunk_count: usize,
threshold: usize,
) -> bool {
is_unavailable(
systematic_chunk_count,
in_flight_requests,
unrequested_validators,
threshold,
)
}
/// Desired number of parallel requests.
///
/// For the given threshold (total required number of chunks) get the desired number of
/// requests we want to have running in parallel at this time.
fn get_desired_request_count(&self, chunk_count: usize, threshold: usize) -> usize {
// Upper bound for parallel requests.
let max_requests_boundary = std::cmp::min(N_PARALLEL, threshold);
// How many chunks are still needed?
let remaining_chunks = threshold.saturating_sub(chunk_count);
// Actual number of requests we want to have in flight in parallel:
// We don't have to make up for any error rate, as an error fetching a systematic chunk
// results in failure of the entire strategy.
std::cmp::min(max_requests_boundary, remaining_chunks)
}
async fn attempt_systematic_recovery<Sender: overseer::AvailabilityRecoverySenderTrait>(
&mut self,
state: &mut State,
common_params: &RecoveryParams,
) -> Result<AvailableData, RecoveryError> {
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(self);
let recovery_duration = common_params.metrics.time_erasure_recovery(strategy_type);
let reconstruct_duration = common_params.metrics.time_erasure_reconstruct(strategy_type);
let chunks = state
.received_chunks
.range(
ChunkIndex(0)..
ChunkIndex(
u32::try_from(self.threshold)
.expect("validator count should not exceed u32"),
),
)
.map(|(_, chunk)| chunk.chunk.clone())
.collect::<Vec<_>>();
let available_data = pezkuwi_erasure_coding::reconstruct_from_systematic_v1(
common_params.n_validators,
chunks,
);
match available_data {
Ok(data) => {
drop(reconstruct_duration);
// Attempt post-recovery check.
do_post_recovery_check(common_params, data)
.await
.inspect_err(|_| {
recovery_duration.map(|rd| rd.stop_and_discard());
})
.inspect(|_| {
gum::trace!(
target: LOG_TARGET,
candidate_hash = ?common_params.candidate_hash,
erasure_root = ?common_params.erasure_root,
"Data recovery from systematic chunks complete",
);
})
},
Err(err) => {
reconstruct_duration.map(|rd| rd.stop_and_discard());
recovery_duration.map(|rd| rd.stop_and_discard());
gum::debug!(
target: LOG_TARGET,
candidate_hash = ?common_params.candidate_hash,
erasure_root = ?common_params.erasure_root,
?err,
"Systematic data recovery error",
);
Err(RecoveryError::Invalid)
},
}
}
}
#[async_trait::async_trait]
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender>
    for FetchSystematicChunks
{
    /// Human-readable name of this strategy.
    fn display_name(&self) -> &'static str {
        "Fetch systematic chunks"
    }

    /// Short identifier of this strategy; passed to the metrics timers and the chunk request
    /// helpers on `State`.
    fn strategy_type(&self) -> &'static str {
        "systematic_chunks"
    }

    /// Fetch the systematic chunks and reconstruct the `AvailableData` from them.
    ///
    /// Outline:
    /// 1. Unless the availability store is bypassed, load locally stored chunks into `state`.
    /// 2. Drop validators whose chunk we already hold or which may not be retried.
    /// 3. Loop: attempt recovery once `threshold` systematic chunks are present, give up with
    ///    `RecoveryError::Unavailable` when the threshold can no longer be reached, otherwise
    ///    launch more requests and wait for responses.
    ///
    /// # Panics
    ///
    /// Indexes `common_params.validator_authority_keys` with validator indices, so an index
    /// outside that list panics.
    async fn run(
        mut self: Box<Self>,
        state: &mut State,
        sender: &mut Sender,
        common_params: &RecoveryParams,
    ) -> Result<AvailableData, RecoveryError> {
        // First query the store for any chunks we've got.
        if !common_params.bypass_availability_store {
            let local_chunk_indices = state.populate_from_av_store(common_params, sender).await;

            for (_, our_c_index) in &local_chunk_indices {
                // If we are among the systematic validators but hold an invalid chunk, we cannot
                // perform the systematic recovery. Fall through to the next strategy.
                if self.validators.iter().any(|(c_index, _)| c_index == our_c_index) &&
                    !state.received_chunks.contains_key(our_c_index)
                {
                    gum::debug!(
                        target: LOG_TARGET,
                        candidate_hash = ?common_params.candidate_hash,
                        erasure_root = ?common_params.erasure_root,
                        requesting = %self.requesting_chunks.len(),
                        total_requesting = %self.requesting_chunks.total_len(),
                        n_validators = %common_params.n_validators,
                        chunk_index = ?our_c_index,
                        "Systematic chunk recovery is not possible. We are among the systematic validators but hold an invalid chunk",
                    );
                    return Err(RecoveryError::Unavailable);
                }
            }
        }

        // No need to query the validators that have the chunks we already received or that we know
        // don't have the data from previous strategies.
        self.validators.retain(|(c_index, v_index)| {
            !state.received_chunks.contains_key(c_index) &&
                state.can_retry_request(
                    &(common_params.validator_authority_keys[v_index.0 as usize].clone(), *v_index),
                    SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
                )
        });

        // Count only chunks with indices in `0..threshold`.
        // NOTE(review): `as u32` truncates silently, while `attempt_systematic_recovery` uses a
        // checked `u32::try_from` for the same bound — consider aligning the two.
        let mut systematic_chunk_count = state
            .received_chunks
            .range(ChunkIndex(0)..ChunkIndex(self.threshold as u32))
            .count();

        // Safe to `take` here, as we're consuming `self` anyway and we're not using the
        // `validators` or `backers` fields in other methods.
        let mut validators_queue: VecDeque<_> = std::mem::take(&mut self.validators)
            .into_iter()
            .map(|(_, validator_index)| {
                (
                    common_params.validator_authority_keys[validator_index.0 as usize].clone(),
                    validator_index,
                )
            })
            .collect();
        let mut backers: Vec<_> = std::mem::take(&mut self.backers)
            .into_iter()
            .map(|validator_index| {
                common_params.validator_authority_keys[validator_index.0 as usize].clone()
            })
            .collect();

        loop {
            // If received_chunks has `systematic_chunk_threshold` entries, attempt to recover the
            // data.
            if systematic_chunk_count >= self.threshold {
                return self.attempt_systematic_recovery::<Sender>(state, common_params).await;
            }

            // Bail out as soon as the threshold is out of reach.
            if Self::is_unavailable(
                validators_queue.len(),
                self.requesting_chunks.total_len(),
                systematic_chunk_count,
                self.threshold,
            ) {
                gum::debug!(
                    target: LOG_TARGET,
                    candidate_hash = ?common_params.candidate_hash,
                    erasure_root = ?common_params.erasure_root,
                    %systematic_chunk_count,
                    requesting = %self.requesting_chunks.len(),
                    total_requesting = %self.requesting_chunks.total_len(),
                    n_validators = %common_params.n_validators,
                    systematic_threshold = ?self.threshold,
                    "Data recovery from systematic chunks is not possible",
                );

                return Err(RecoveryError::Unavailable);
            }

            let desired_requests_count =
                self.get_desired_request_count(systematic_chunk_count, self.threshold);
            let already_requesting_count = self.requesting_chunks.len();
            gum::debug!(
                target: LOG_TARGET,
                ?common_params.candidate_hash,
                ?desired_requests_count,
                total_received = ?systematic_chunk_count,
                systematic_threshold = ?self.threshold,
                ?already_requesting_count,
                "Requesting systematic availability chunks for a candidate",
            );

            let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);

            state
                .launch_parallel_chunk_requests(
                    strategy_type,
                    common_params,
                    sender,
                    desired_requests_count,
                    &mut validators_queue,
                    &mut self.requesting_chunks,
                )
                .await;

            // The result is deliberately ignored: the closure below keeps
            // `systematic_chunk_count` up to date, and the checks at the top of the loop decide
            // how to proceed.
            let _ = state
                .wait_for_chunks(
                    strategy_type,
                    common_params,
                    SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
                    &mut validators_queue,
                    &mut self.requesting_chunks,
                    &mut backers,
                    |unrequested_validators,
                     in_flight_reqs,
                     // Don't use this chunk count, as it may contain non-systematic chunks.
                     _chunk_count,
                     new_systematic_chunk_count| {
                        systematic_chunk_count = new_systematic_chunk_count;

                        let is_unavailable = Self::is_unavailable(
                            unrequested_validators,
                            in_flight_reqs,
                            systematic_chunk_count,
                            self.threshold,
                        );

                        // True once we either have enough chunks or can no longer get enough.
                        systematic_chunk_count >= self.threshold || is_unavailable
                    },
                )
                .await;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use pezkuwi_erasure_coding::systematic_recovery_threshold;

    /// `get_desired_request_count` returns the number of missing chunks, capped at
    /// `N_PARALLEL`, and zero once enough chunks were received.
    #[test]
    fn test_get_desired_request_count() {
        let num_validators = 100;
        let threshold = systematic_recovery_threshold(num_validators).unwrap();

        let params = FetchSystematicChunksParams {
            validators: vec![(1.into(), 1.into()); num_validators],
            backers: vec![],
        };
        let strategy = FetchSystematicChunks::new(params);

        // (chunks already received, threshold, expected request count)
        let cases = [
            (0, threshold, threshold),
            (5, threshold, threshold - 5),
            (num_validators * 2, threshold, 0),
            (0, N_PARALLEL * 2, N_PARALLEL),
            (N_PARALLEL, N_PARALLEL + 2, 2),
        ];

        for (chunk_count, case_threshold, expected) in cases {
            assert_eq!(strategy.get_desired_request_count(chunk_count, case_threshold), expected);
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,47 @@
[package]
name = "pezkuwi-availability-bitfield-distribution"
version = "7.0.0"
description = "Pezkuwi Bitfield Distribution subsystem, which gossips signed availability bitfields used to compactly determine which backed candidates are available or not based on a 2/3+ quorum."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
futures = { workspace = true }
futures-timer = { workspace = true }
gum = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
[dev-dependencies]
assert_matches = { workspace = true }
bitvec = { features = ["alloc"], workspace = true }
maplit = { workspace = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
rand_chacha = { workspace = true, default-features = true }
sp-application-crypto = { workspace = true, default-features = true }
sp-authority-discovery = { workspace = true, default-features = true }
sp-core = { workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
sp-keystore = { workspace = true, default-features = true }
sp-tracing = { workspace = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sp-authority-discovery/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
@@ -0,0 +1,919 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! The bitfield distribution
//!
//! In case this node is a validator, gossips its own signed availability bitfield
//! for a particular relay parent.
//! Independently of that, forwards bitfields received from peers to other interested peers.
#![deny(unused_crate_dependencies)]
use futures::{channel::oneshot, FutureExt};
use net_protocol::filter_by_peer_version;
use pezkuwi_node_network_protocol::{
self as net_protocol,
grid_topology::{
GridNeighbors, RandomRouting, RequiredRouting, SessionBoundGridTopologyStorage,
},
peer_set::{ProtocolVersion, ValidationVersion},
v3 as protocol_v3, OurView, PeerId, UnifiedReputationChange as Rep, ValidationProtocols, View,
};
use pezkuwi_node_subsystem::{
messages::*, overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem,
SubsystemError, SubsystemResult,
};
use pezkuwi_node_subsystem_util::{
self as util,
reputation::{ReputationAggregator, REPUTATION_CHANGE_INTERVAL},
};
use futures::select;
use pezkuwi_primitives::{Hash, SignedAvailabilityBitfield, SigningContext, ValidatorId};
use rand::{CryptoRng, Rng, SeedableRng};
use std::{
collections::{HashMap, HashSet},
time::Duration,
};
use self::metrics::Metrics;
mod metrics;
#[cfg(test)]
mod tests;
// Reputation changes applied to peers depending on the bitfield gossip they send us.

/// The signature on a received bitfield did not verify against the claimed validator.
const COST_SIGNATURE_INVALID: Rep = Rep::CostMajor("Bitfield signature invalid");
/// The validator index in a received bitfield is outside the validator set.
const COST_VALIDATOR_INDEX_INVALID: Rep = Rep::CostMajor("Bitfield validator index invalid");
/// A bitfield was received for a relay parent whose validator set is empty.
const COST_MISSING_PEER_SESSION_KEY: Rep = Rep::CostMinor("Missing peer session key");
/// A bitfield was received for a relay parent we are not working on.
const COST_NOT_IN_VIEW: Rep = Rep::CostMinor("Not interested in that parent hash");
/// The peer sent a bitfield for a validator it had already sent one for.
const COST_PEER_DUPLICATE_MESSAGE: Rep =
    Rep::CostMinorRepeated("Peer sent the same message multiple times");
/// The peer was the first to deliver a valid bitfield for a validator.
const BENEFIT_VALID_MESSAGE_FIRST: Rep =
    Rep::BenefitMinorFirst("Valid message with new information");
/// The peer delivered a bitfield identical to one we already accepted from elsewhere.
const BENEFIT_VALID_MESSAGE: Rep = Rep::BenefitMinor("Valid message");
/// Checked signed availability bitfield that is distributed
/// to other peers.
///
/// Built either from a locally produced bitfield or from a peer's bitfield after its signature
/// was checked.
#[derive(Debug, Clone, PartialEq, Eq)]
struct BitfieldGossipMessage {
    /// The relay parent this message is relative to.
    relay_parent: Hash,
    /// The actual signed availability bitfield.
    signed_availability: SignedAvailabilityBitfield,
}
impl BitfieldGossipMessage {
    /// Convert this message into the versioned validation protocol message sent over the wire.
    ///
    /// `recipient_version` is the protocol version of the peer the message is meant for.
    fn into_validation_protocol(
        self,
        recipient_version: ProtocolVersion,
    ) -> net_protocol::VersionedValidationProtocol {
        self.into_network_message(recipient_version).into()
    }

    /// Convert this message into a bitfield distribution network message.
    ///
    /// Only the v3 validation protocol is handled. If `recipient_version` cannot be converted
    /// into a `ValidationVersion`, a warning is logged and the message is still encoded as v3.
    fn into_network_message(
        self,
        recipient_version: ProtocolVersion,
    ) -> net_protocol::BitfieldDistributionMessage {
        // Kept as an exhaustive `match` so that adding a new `ValidationVersion` variant forces
        // this function to be revisited.
        match ValidationVersion::try_from(recipient_version).ok() {
            Some(ValidationVersion::V3) => self.into_v3_network_message(),
            None => {
                gum::warn!(
                    target: LOG_TARGET,
                    version = ?recipient_version,
                    "Unknown protocol version provided for message recipient"
                );

                // Fall back to v3 so that a message is still produced for the recipient.
                self.into_v3_network_message()
            },
        }
    }

    /// Encode this message using the v3 bitfield distribution protocol.
    fn into_v3_network_message(self) -> net_protocol::BitfieldDistributionMessage {
        ValidationProtocols::V3(protocol_v3::BitfieldDistributionMessage::Bitfield(
            self.relay_parent,
            self.signed_availability.into(),
        ))
    }
}
/// Data stored on a per-peer basis.
#[derive(Debug)]
pub struct PeerData {
    /// The peer's view.
    ///
    /// A bitfield is only forwarded to a peer whose view contains its relay parent.
    view: View,
    /// The peer's protocol version.
    version: ProtocolVersion,
}
/// Data used to track information of peers and relay parents the
/// overseer ordered us to work on.
#[derive(Default)]
struct ProtocolState {
    /// Track all active peer views and protocol versions
    /// to determine what is relevant to them.
    peer_data: HashMap<PeerId, PeerData>,

    /// The current and previous gossip topologies
    topologies: SessionBoundGridTopologyStorage,

    /// Our current view.
    ///
    /// Bitfields from peers for relay parents outside this view are rejected.
    view: OurView,

    /// Additional data particular to a relay parent.
    ///
    /// An entry is added when a leaf is activated and its validator set and signing context
    /// could be queried.
    per_relay_parent: HashMap<Hash, PerRelayParentData>,

    /// Aggregated reputation change
    reputation: ReputationAggregator,
}
/// Data for a particular relay parent.
#[derive(Debug)]
struct PerRelayParentData {
    /// Signing context for a particular relay parent.
    signing_context: SigningContext,

    /// Set of validators for a particular relay parent.
    ///
    /// Indexed by the validator index carried in a signed bitfield.
    validator_set: Vec<ValidatorId>,

    /// Set of validators for a particular relay parent for which we
    /// received a valid `BitfieldGossipMessage`.
    /// Also serves as the list of known messages for peers connecting
    /// after bitfield gossips were already received.
    one_per_validator: HashMap<ValidatorId, BitfieldGossipMessage>,

    /// Avoid duplicate message transmission to our peers.
    ///
    /// Maps each peer to the validators whose bitfield we already sent to it.
    message_sent_to_peer: HashMap<PeerId, HashSet<ValidatorId>>,

    /// Track messages that were already received from a peer
    /// to prevent flooding.
    ///
    /// Maps each peer to the validators whose bitfield it already sent to us.
    message_received_from_peer: HashMap<PeerId, HashSet<ValidatorId>>,
}
impl PerRelayParentData {
    /// Create the bookkeeping for a relay parent, with no bitfields recorded yet.
    fn new(signing_context: SigningContext, validator_set: Vec<ValidatorId>) -> Self {
        Self {
            signing_context,
            validator_set,
            one_per_validator: HashMap::new(),
            message_sent_to_peer: HashMap::new(),
            message_received_from_peer: HashMap::new(),
        }
    }

    /// Whether `peer` still needs the bitfield signed by `signed_by`.
    ///
    /// The peer needs it unless we already sent that bitfield to the peer or the peer already
    /// sent it to us.
    fn message_from_validator_needed_by_peer(
        &self,
        peer: &PeerId,
        signed_by: &ValidatorId,
    ) -> bool {
        let already_sent = self
            .message_sent_to_peer
            .get(peer)
            .map_or(false, |validators| validators.contains(signed_by));
        if already_sent {
            return false;
        }

        let already_received = self
            .message_received_from_peer
            .get(peer)
            .map_or(false, |validators| validators.contains(signed_by));

        !already_received
    }
}
/// Log target used for all log output of this subsystem.
const LOG_TARGET: &str = "teyrchain::bitfield-distribution";
/// The bitfield distribution subsystem.
pub struct BitfieldDistribution {
    /// Metrics handle used for timers and counters of this subsystem.
    metrics: Metrics,
}
#[overseer::contextbounds(BitfieldDistribution, prefix = self::overseer)]
impl BitfieldDistribution {
/// Create a new instance of the `BitfieldDistribution` subsystem.
pub fn new(metrics: Metrics) -> Self {
Self { metrics }
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, ctx: Context) {
let mut state = ProtocolState::default();
let mut rng = rand::rngs::StdRng::from_entropy();
self.run_inner(ctx, &mut state, REPUTATION_CHANGE_INTERVAL, &mut rng).await
}
async fn run_inner<Context>(
self,
mut ctx: Context,
state: &mut ProtocolState,
reputation_interval: Duration,
rng: &mut (impl CryptoRng + Rng),
) {
// work: process incoming messages from the overseer and process accordingly.
let new_reputation_delay = || futures_timer::Delay::new(reputation_interval).fuse();
let mut reputation_delay = new_reputation_delay();
loop {
select! {
_ = reputation_delay => {
state.reputation.send(ctx.sender()).await;
reputation_delay = new_reputation_delay();
},
message = ctx.recv().fuse() => {
let message = match message {
Ok(message) => message,
Err(err) => {
gum::error!(
target: LOG_TARGET,
?err,
"Failed to receive a message from Overseer, exiting"
);
return
},
};
match message {
FromOrchestra::Communication {
msg:
BitfieldDistributionMessage::DistributeBitfield(
relay_parent,
signed_availability,
),
} => {
gum::trace!(target: LOG_TARGET, ?relay_parent, "Processing DistributeBitfield");
handle_bitfield_distribution(
&mut ctx,
state,
&self.metrics,
relay_parent,
signed_availability,
rng,
)
.await;
},
FromOrchestra::Communication {
msg: BitfieldDistributionMessage::NetworkBridgeUpdate(event),
} => {
gum::trace!(target: LOG_TARGET, "Processing NetworkMessage");
// a network message was received
handle_network_msg(&mut ctx, state, &self.metrics, event, rng).await;
},
FromOrchestra::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated,
..
})) => {
let _timer = self.metrics.time_active_leaves_update();
if let Some(activated) = activated {
let relay_parent = activated.hash;
gum::trace!(target: LOG_TARGET, ?relay_parent, "activated");
// query validator set and signing context per relay_parent once only
match query_basics(&mut ctx, relay_parent).await {
Ok(Some((validator_set, signing_context))) => {
// If our runtime API fails, we don't take down the node,
// but we might alter peers' reputations erroneously as a result
// of not having the correct bookkeeping. If we have lost a race
// with state pruning, it is unlikely that peers will be sending
// us anything to do with this relay-parent anyway.
let _ = state.per_relay_parent.insert(
relay_parent,
PerRelayParentData::new(signing_context, validator_set),
);
},
Err(err) => {
gum::warn!(target: LOG_TARGET, ?err, "query_basics has failed");
},
_ => {},
}
}
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, number)) => {
gum::trace!(target: LOG_TARGET, ?hash, %number, "block finalized");
},
FromOrchestra::Signal(OverseerSignal::Conclude) => {
gum::info!(target: LOG_TARGET, "Conclude");
return
},
}
}
}
}
}
}
/// Record a reputation change for `peer` and trace it.
///
/// The change is handed to the `ReputationAggregator`; `relay_parent` is used for the trace log
/// only.
async fn modify_reputation(
    reputation: &mut ReputationAggregator,
    sender: &mut impl overseer::BitfieldDistributionSenderTrait,
    relay_parent: Hash,
    peer: PeerId,
    rep: Rep,
) {
    gum::trace!(target: LOG_TARGET, ?relay_parent, ?rep, %peer, "reputation change");

    let pending_change = reputation.modify(sender, peer, rep);
    pending_change.await;
}
/// Distribute a bitfield that was produced and signed by this node.
///
/// Nothing is sent if the relay parent is not one we track, if its validator set is empty, or if
/// the bitfield's validator index is not in the validator set. Otherwise the bitfield is relayed
/// via `relay_message` and counted as an own bitfield sent.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn handle_bitfield_distribution<Context>(
    ctx: &mut Context,
    state: &mut ProtocolState,
    metrics: &Metrics,
    relay_parent: Hash,
    signed_availability: SignedAvailabilityBitfield,
    rng: &mut (impl CryptoRng + Rng),
) {
    let _timer = metrics.time_handle_bitfield_distribution();

    // Ignore anything the overseer did not tell this subsystem to work on
    let job_data = match state.per_relay_parent.get_mut(&relay_parent) {
        Some(job_data) => job_data,
        None => {
            gum::debug!(
                target: LOG_TARGET,
                ?relay_parent,
                "Not supposed to work on relay parent related data",
            );

            return;
        },
    };

    let session_idx = job_data.signing_context.session_index;

    if job_data.validator_set.is_empty() {
        gum::debug!(target: LOG_TARGET, ?relay_parent, "validator set is empty");
        return;
    }

    // Resolve the signing validator's public key from the index carried by the bitfield.
    let validator_index = signed_availability.validator_index();
    let validator = match job_data.validator_set.get(validator_index.0 as usize) {
        Some(validator) => validator.clone(),
        None => {
            gum::debug!(target: LOG_TARGET, validator_index = ?validator_index.0, "Could not find a validator for index");
            return;
        },
    };

    let topology = state.topologies.get_topology_or_fallback(session_idx).local_grid_neighbors();
    let required_routing = topology.required_routing_by_index(validator_index, true);
    let msg = BitfieldGossipMessage { relay_parent, signed_availability };

    relay_message(
        ctx,
        job_data,
        topology,
        &mut state.peer_data,
        validator,
        msg,
        required_routing,
        rng,
    )
    .await;

    metrics.on_own_bitfield_sent();
}
/// Distribute a given valid and signature checked bitfield message.
///
/// Can be originated by another subsystem or received via network from another peer.
///
/// Steps, in order:
/// 1. Forward the bitfield to the provisioner.
/// 2. Select the recipients: peers whose view contains the relay parent, which still need the
///    bitfield of `validator`, and which are either selected by the grid topology for
///    `required_routing` or picked by random routing.
/// 3. Record the bitfield as sent to each selected peer.
/// 4. Send the message to the selected peers that speak the v3 validation protocol.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn relay_message<Context>(
    ctx: &mut Context,
    job_data: &mut PerRelayParentData,
    topology_neighbors: &GridNeighbors,
    peers: &mut HashMap<PeerId, PeerData>,
    validator: ValidatorId,
    message: BitfieldGossipMessage,
    required_routing: RequiredRouting,
    rng: &mut (impl CryptoRng + Rng),
) {
    let relay_parent = message.relay_parent;

    // notify the overseer about a new and valid signed bitfield
    ctx.send_message(ProvisionerMessage::ProvisionableData(
        relay_parent,
        ProvisionableData::Bitfield(relay_parent, message.signed_availability.clone()),
    ))
    .await;

    let total_peers = peers.len();
    let mut random_routing: RandomRouting = Default::default();

    // pass on the bitfield distribution to all interested peers
    let interested_peers = peers
        .iter()
        .filter_map(|(peer, data)| {
            // check interest in the peer in this message's relay parent
            if !data.view.contains(&message.relay_parent) {
                return None;
            }

            if !job_data.message_from_validator_needed_by_peer(peer, &validator) {
                return None;
            }

            // Peers outside the grid route may still be picked at random; the random routing
            // counter is only bumped for peers that were actually picked that way.
            let in_topology = topology_neighbors.route_to_peer(required_routing, peer);
            let need_routing = in_topology || {
                let route_random = random_routing.sample(total_peers, rng);
                if route_random {
                    random_routing.inc_sent();
                }

                route_random
            };

            if need_routing {
                Some((*peer, data.version))
            } else {
                None
            }
        })
        .collect::<Vec<(PeerId, ProtocolVersion)>>();

    // track the message as sent for each selected peer
    for (peer, _) in &interested_peers {
        job_data
            .message_sent_to_peer
            .entry(*peer)
            .or_default()
            .insert(validator.clone());
    }

    if interested_peers.is_empty() {
        gum::trace!(
            target: LOG_TARGET,
            ?relay_parent,
            "no peers are interested in gossip for relay parent",
        );
        return;
    }

    let v3_interested_peers =
        filter_by_peer_version(&interested_peers, ValidationVersion::V3.into());

    if !v3_interested_peers.is_empty() {
        ctx.send_message(NetworkBridgeTxMessage::SendValidationMessage(
            v3_interested_peers,
            message.into_validation_protocol(ValidationVersion::V3.into()),
        ))
        .await;
    }
}
/// Handle an incoming message from a peer.
///
/// The bitfield passes through a series of checks; each failed check adjusts the reputation of
/// `origin` and stops processing:
///
/// 1. The relay parent must be in our view and tracked (`COST_NOT_IN_VIEW`).
/// 2. The validator set must not be empty (`COST_MISSING_PEER_SESSION_KEY`).
/// 3. The validator index must be in the validator set (`COST_VALIDATOR_INDEX_INVALID`).
/// 4. The peer must not have sent a bitfield for this validator before
///    (`COST_PEER_DUPLICATE_MESSAGE`).
/// 5. If a bitfield for this validator is already known, the peer is rewarded with
///    `BENEFIT_VALID_MESSAGE` when it sent the same bitfield, and nothing is relayed.
/// 6. The signature must be valid (`COST_SIGNATURE_INVALID`).
///
/// A bitfield passing all checks is stored, relayed via `relay_message`, and the peer is rewarded
/// with `BENEFIT_VALID_MESSAGE_FIRST`.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn process_incoming_peer_message<Context>(
    ctx: &mut Context,
    state: &mut ProtocolState,
    metrics: &Metrics,
    origin: PeerId,
    message: net_protocol::BitfieldDistributionMessage,
    rng: &mut (impl CryptoRng + Rng),
) {
    let (relay_parent, bitfield) = match message {
        ValidationProtocols::V3(protocol_v3::BitfieldDistributionMessage::Bitfield(
            relay_parent,
            bitfield,
        )) => (relay_parent, bitfield),
    };

    gum::trace!(
        target: LOG_TARGET,
        peer = %origin,
        ?relay_parent,
        "received bitfield gossip from peer"
    );

    // we don't care about this, not part of our view.
    if !state.view.contains(&relay_parent) {
        modify_reputation(
            &mut state.reputation,
            ctx.sender(),
            relay_parent,
            origin,
            COST_NOT_IN_VIEW,
        )
        .await;
        return;
    }

    // Ignore anything the overseer did not tell this subsystem to work on.
    let job_data = match state.per_relay_parent.get_mut(&relay_parent) {
        Some(job_data) => job_data,
        None => {
            modify_reputation(
                &mut state.reputation,
                ctx.sender(),
                relay_parent,
                origin,
                COST_NOT_IN_VIEW,
            )
            .await;
            return;
        },
    };

    let validator_index = bitfield.unchecked_validator_index();

    if job_data.validator_set.is_empty() {
        gum::trace!(target: LOG_TARGET, ?relay_parent, ?origin, "Validator set is empty",);
        modify_reputation(
            &mut state.reputation,
            ctx.sender(),
            relay_parent,
            origin,
            COST_MISSING_PEER_SESSION_KEY,
        )
        .await;
        return;
    }

    // Use the (untrusted) validator index provided by the signed payload
    // and see if that one actually signed the availability bitset.
    let signing_context = job_data.signing_context.clone();
    let validator = match job_data.validator_set.get(validator_index.0 as usize) {
        Some(validator) => validator.clone(),
        None => {
            modify_reputation(
                &mut state.reputation,
                ctx.sender(),
                relay_parent,
                origin,
                COST_VALIDATOR_INDEX_INVALID,
            )
            .await;
            return;
        },
    };

    // Check if the peer already sent us a message for the validator denoted in the message earlier.
    // Must be done after validator index verification, in order to avoid storing an unbounded
    // number of set entries.
    //
    // `HashSet::insert` returns `false` when the value was already present, which covers the
    // membership test and the insertion with a single lookup.
    let first_from_peer = job_data
        .message_received_from_peer
        .entry(origin)
        .or_default()
        .insert(validator.clone());

    if !first_from_peer {
        gum::trace!(target: LOG_TARGET, ?validator_index, ?origin, "Duplicate message");
        modify_reputation(
            &mut state.reputation,
            ctx.sender(),
            relay_parent,
            origin,
            COST_PEER_DUPLICATE_MESSAGE,
        )
        .await;
        return;
    }

    // relay a message received from a validator at most _once_
    if let Some(old_message) = job_data.one_per_validator.get(&validator) {
        gum::trace!(
            target: LOG_TARGET,
            ?validator_index,
            "already received a message for validator",
        );

        // The stored bitfield was signature checked, so an identical one is valid as well.
        if old_message.signed_availability.as_unchecked() == &bitfield {
            modify_reputation(
                &mut state.reputation,
                ctx.sender(),
                relay_parent,
                origin,
                BENEFIT_VALID_MESSAGE,
            )
            .await;
        }
        return;
    }

    let signed_availability = match bitfield.try_into_checked(&signing_context, &validator) {
        Ok(checked) => checked,
        Err(_) => {
            modify_reputation(
                &mut state.reputation,
                ctx.sender(),
                relay_parent,
                origin,
                COST_SIGNATURE_INVALID,
            )
            .await;
            return;
        },
    };

    let message = BitfieldGossipMessage { relay_parent, signed_availability };

    let topology = state
        .topologies
        .get_topology_or_fallback(job_data.signing_context.session_index)
        .local_grid_neighbors();
    let required_routing = topology.required_routing_by_index(validator_index, false);

    metrics.on_bitfield_received();
    job_data.one_per_validator.insert(validator.clone(), message.clone());

    relay_message(
        ctx,
        job_data,
        topology,
        &mut state.peer_data,
        validator,
        message,
        required_routing,
        rng,
    )
    .await;

    modify_reputation(
        &mut state.reputation,
        ctx.sender(),
        relay_parent,
        origin,
        BENEFIT_VALID_MESSAGE_FIRST,
    )
    .await
}
/// Deal with network bridge updates and track what needs to be tracked
/// which depends on the message type received.
///
/// * `PeerConnected` / `PeerDisconnected` add or remove the peer's entry in `state.peer_data`.
/// * `NewGossipTopology` installs the new topology and replays the known view of every newly
///   added grid neighbour through `handle_peer_view_change`.
/// * `PeerViewChange` is forwarded to `handle_peer_view_change` for known peers only.
/// * `OurViewChange` is forwarded to `handle_our_view_change`.
/// * `PeerMessage` is forwarded to `process_incoming_peer_message`.
/// * `UpdatedAuthorityIds` updates the authority ids stored in the current topology.
///
/// The time spent in this function is observed via `metrics.time_handle_network_msg()`.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn handle_network_msg<Context>(
	ctx: &mut Context,
	state: &mut ProtocolState,
	metrics: &Metrics,
	bridge_message: NetworkBridgeEvent<net_protocol::BitfieldDistributionMessage>,
	rng: &mut (impl CryptoRng + Rng),
) {
	// The timer records its observation when it is dropped at the end of this function.
	let _timer = metrics.time_handle_network_msg();

	match bridge_message {
		NetworkBridgeEvent::PeerConnected(peer, role, version, _) => {
			gum::trace!(target: LOG_TARGET, ?peer, ?role, "Peer connected");
			// insert if none already present
			state
				.peer_data
				.entry(peer)
				.or_insert_with(|| PeerData { view: View::default(), version });
		},
		NetworkBridgeEvent::PeerDisconnected(peer) => {
			gum::trace!(target: LOG_TARGET, ?peer, "Peer disconnected");
			// get rid of superfluous data
			state.peer_data.remove(&peer);
		},
		NetworkBridgeEvent::NewGossipTopology(gossip_topology) => {
			let session_index = gossip_topology.session;
			let new_topology = gossip_topology.topology;
			// Snapshot the previous neighbours so the newly added ones can be diffed out below.
			let prev_neighbors =
				state.topologies.get_current_topology().local_grid_neighbors().clone();

			state.topologies.update_topology(
				session_index,
				new_topology,
				gossip_topology.local_index,
			);

			let current_topology = state.topologies.get_current_topology();
			let newly_added = current_topology.local_grid_neighbors().peers_diff(&prev_neighbors);

			gum::debug!(
				target: LOG_TARGET,
				?session_index,
				newly_added_peers = ?newly_added.len(),
				"New gossip topology received",
			);

			for new_peer in newly_added {
				let old_view = match state.peer_data.get_mut(&new_peer) {
					Some(d) => {
						// In case we already knew that peer in the past it might have had an
						// existing view. Take it out (leaving an empty view behind) so that
						// replaying it below computes the full delta for that peer.
						std::mem::take(&mut d.view)
					},
					None => {
						// For peers which are currently unknown, we'll send topology-related
						// messages to them when they connect and send their first view update.
						continue;
					},
				};

				handle_peer_view_change(ctx, state, new_peer, old_view, rng).await;
			}
		},
		NetworkBridgeEvent::PeerViewChange(peer_id, new_view) => {
			gum::trace!(target: LOG_TARGET, ?peer_id, ?new_view, "Peer view change");
			// Only peers with an entry in `peer_data` are processed.
			if state.peer_data.contains_key(&peer_id) {
				handle_peer_view_change(ctx, state, peer_id, new_view, rng).await;
			}
		},
		NetworkBridgeEvent::OurViewChange(new_view) => {
			gum::trace!(target: LOG_TARGET, ?new_view, "Our view change");
			handle_our_view_change(state, new_view);
		},
		NetworkBridgeEvent::PeerMessage(remote, message) =>
			process_incoming_peer_message(ctx, state, metrics, remote, message, rng).await,
		NetworkBridgeEvent::UpdatedAuthorityIds(peer_id, authority_ids) => {
			state
				.topologies
				.get_current_topology_mut()
				.update_authority_ids(peer_id, &authority_ids);
		},
	}
}
/// Install `view` as our current view and reconcile the per-relay-parent bookkeeping.
///
/// Relay parents that newly entered the view are expected to already have an entry in
/// `state.per_relay_parent`; a missing entry is only reported via an error log. Relay parents
/// that left the view have their entry dropped.
fn handle_our_view_change(state: &mut ProtocolState, view: OurView) {
	let previous_view = std::mem::replace(&mut (state.view), view);

	for added in state.view.difference(&previous_view) {
		if state.per_relay_parent.contains_key(&added) {
			continue;
		}
		// Is guaranteed to be handled in `ActiveHead` update
		// so this should never happen.
		gum::error!(
			target: LOG_TARGET,
			%added,
			"Our view contains {}, but not in active heads",
			&added
		);
	}

	// Clean up relay parents we are not interested in any more.
	for removed in previous_view.difference(&state.view) {
		let _ = state.per_relay_parent.remove(&removed);
	}
}
/// Send the difference between two views which were not sent
/// to that particular peer.
///
/// The peer's stored view is replaced by `view`, and for every relay parent that is new in it
/// (and tracked by us for the current topology session) all bitfield messages the peer still
/// needs are sent via `send_tracked_gossip_message`.
///
/// Peers that are not grid neighbours are only served if a random draw
/// (`util::gen_ratio_rng`, parameterised by how far the neighbour count falls short of
/// `util::MIN_GOSSIP_PEERS`) succeeds; otherwise the view change is ignored.
///
/// This requires that there is an entry in the `peer_data` field for the
/// peer; if there is none a warning is logged and nothing else happens.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn handle_peer_view_change<Context>(
	ctx: &mut Context,
	state: &mut ProtocolState,
	origin: PeerId,
	view: View,
	rng: &mut (impl CryptoRng + Rng),
) {
	let peer_data = match state.peer_data.get_mut(&origin) {
		None => {
			gum::warn!(
				target: LOG_TARGET,
				peer = ?origin,
				"Attempted to update peer view for unknown peer."
			);
			return;
		},
		Some(pd) => pd,
	};

	// Relay parents that are in the new view but were not in the previously stored one.
	let added = peer_data.view.replace_difference(view).cloned().collect::<Vec<_>>();

	let current_session_index = state.topologies.get_current_session_index();
	let topology = state.topologies.get_current_topology().local_grid_neighbors();
	let is_gossip_peer = topology.route_to_peer(RequiredRouting::GridXY, &origin);
	let lucky = is_gossip_peer ||
		util::gen_ratio_rng(
			util::MIN_GOSSIP_PEERS.saturating_sub(topology.len()),
			util::MIN_GOSSIP_PEERS,
			rng,
		);

	if !lucky {
		gum::trace!(target: LOG_TARGET, ?origin, "Peer view change is ignored");
		return;
	}

	// Send all messages we've seen before and the peer is now interested
	// in to that peer.
	let delta_set: Vec<(ValidatorId, BitfieldGossipMessage)> = added
		.into_iter()
		.filter_map(|new_relay_parent_interest| {
			// A relay parent is in the peer's view which is not in ours (or which belongs to
			// another session than the current topology): ignore those.
			let job_data = state
				.per_relay_parent
				.get(&new_relay_parent_interest)
				.filter(|job_data| job_data.signing_context.session_index == current_session_index)?;

			// Send all jointly known messages for a validator (given the current relay parent)
			// to the peer `origin`, except for the ones the peer already has. Filtering happens
			// on borrowed entries so that only the messages actually sent are cloned.
			Some(
				job_data
					.one_per_validator
					.iter()
					.filter(move |&(validator, _message)| {
						job_data.message_from_validator_needed_by_peer(&origin, validator)
					})
					.map(|(validator, message)| (validator.clone(), message.clone())),
			)
		})
		.flatten()
		.collect();

	for (validator, message) in delta_set {
		send_tracked_gossip_message(ctx, state, origin, validator, message).await;
	}
}
/// Send `message` to the peer `dest` and remember, in the per relay parent data, that the
/// message signed by `validator` was sent to that peer.
///
/// Nothing is sent when the message's relay parent is not tracked in `state.per_relay_parent`
/// or when `dest` has no entry in `state.peer_data`.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn send_tracked_gossip_message<Context>(
	ctx: &mut Context,
	state: &mut ProtocolState,
	dest: PeerId,
	validator: ValidatorId,
	message: BitfieldGossipMessage,
) {
	let job_data = match state.per_relay_parent.get_mut(&message.relay_parent) {
		Some(job_data) => job_data,
		None => return,
	};

	gum::trace!(
		target: LOG_TARGET,
		?dest,
		?validator,
		relay_parent = ?message.relay_parent,
		"Sending gossip message"
	);

	// The protocol version of the peer decides how the message is encoded for the wire.
	let version = match state.peer_data.get(&dest) {
		Some(peer_data) => peer_data.version,
		None => return,
	};

	// Record the send before handing the message over to the network bridge.
	job_data.message_sent_to_peer.entry(dest).or_default().insert(validator.clone());

	let outgoing = NetworkBridgeTxMessage::SendValidationMessage(
		vec![dest],
		message.into_validation_protocol(version),
	);
	ctx.send_message(outgoing).await;
}
#[overseer::subsystem(BitfieldDistribution, error=SubsystemError, prefix=self::overseer)]
impl<Context> BitfieldDistribution {
	/// Wrap the subsystem's `run` loop into a boxed future, mapping its output to `Ok(())`,
	/// and return it together with the subsystem name.
	fn start(self, ctx: Context) -> SpawnedSubsystem {
		SpawnedSubsystem {
			name: "bitfield-distribution-subsystem",
			future: self.run(ctx).map(|_| Ok(())).boxed(),
		}
	}
}
/// Fetch the validator set and the signing context for `relay_parent` from the runtime API.
///
/// Returns `Ok(Some(..))` when both runtime requests succeed and `Ok(None)` (after logging a
/// warning) when either of them reports an error. An `Err` is returned only if one of the
/// response channels fails.
#[overseer::contextbounds(BitfieldDistribution, prefix=self::overseer)]
async fn query_basics<Context>(
	ctx: &mut Context,
	relay_parent: Hash,
) -> SubsystemResult<Option<(Vec<ValidatorId>, SigningContext)>> {
	let (validators_tx, validators_rx) = oneshot::channel();
	let (session_tx, session_rx) = oneshot::channel();

	// Query the validators.
	let validators_request =
		RuntimeApiMessage::Request(relay_parent, RuntimeApiRequest::Validators(validators_tx));
	ctx.send_message(validators_request).await;

	// Query the session index, needed for the signing context.
	let session_request = RuntimeApiMessage::Request(
		relay_parent,
		RuntimeApiRequest::SessionIndexForChild(session_tx),
	);
	ctx.send_message(session_request).await;

	let validators_response = validators_rx.await?;
	let session_response = session_rx.await?;

	match (validators_response, session_response) {
		(Err(err), _) | (_, Err(err)) => {
			gum::warn!(
				target: LOG_TARGET,
				?relay_parent,
				?err,
				"Failed to fetch basics from runtime API"
			);
			Ok(None)
		},
		(Ok(validators), Ok(session_index)) => {
			let signing_context = SigningContext { parent_hash: relay_parent, session_index };
			Ok(Some((validators, signing_context)))
		},
	}
}
@@ -0,0 +1,108 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::metrics::{prometheus, Metrics as MetricsTrait};
/// The Prometheus collectors backing the bitfield distribution [`Metrics`].
#[derive(Clone)]
struct MetricsInner {
/// Number of own availability bitfields sent to other peers.
sent_own_availability_bitfields: prometheus::Counter<prometheus::U64>,
/// Number of valid availability bitfields received from other peers.
received_availability_bitfields: prometheus::Counter<prometheus::U64>,
/// Time spent within `bitfield_distribution::active_leaves_update`.
active_leaves_update: prometheus::Histogram,
/// Time spent within `bitfield_distribution::handle_bitfield_distribution`.
handle_bitfield_distribution: prometheus::Histogram,
/// Time spent within `bitfield_distribution::handle_network_msg`.
handle_network_msg: prometheus::Histogram,
}
/// Bitfield Distribution metrics.
///
/// Wraps an optional set of collectors: the `Default` value holds `None`, in which case all
/// recording methods do nothing and all timer methods return `None`.
#[derive(Default, Clone)]
pub struct Metrics(Option<MetricsInner>);
impl Metrics {
	/// Count one own availability bitfield as sent.
	pub(crate) fn on_own_bitfield_sent(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.sent_own_availability_bitfields.inc();
		}
	}

	/// Count one availability bitfield as received.
	pub(crate) fn on_bitfield_received(&self) {
		if let Some(inner) = self.0.as_ref() {
			inner.received_availability_bitfields.inc();
		}
	}

	/// Provide a timer for `active_leaves_update` which observes on drop.
	pub(crate) fn time_active_leaves_update(
		&self,
	) -> Option<prometheus::prometheus::HistogramTimer> {
		let inner = self.0.as_ref()?;
		Some(inner.active_leaves_update.start_timer())
	}

	/// Provide a timer for `handle_bitfield_distribution` which observes on drop.
	pub(crate) fn time_handle_bitfield_distribution(
		&self,
	) -> Option<prometheus::prometheus::HistogramTimer> {
		let inner = self.0.as_ref()?;
		Some(inner.handle_bitfield_distribution.start_timer())
	}

	/// Provide a timer for `handle_network_msg` which observes on drop.
	pub(crate) fn time_handle_network_msg(&self) -> Option<prometheus::prometheus::HistogramTimer> {
		let inner = self.0.as_ref()?;
		Some(inner.handle_network_msg.start_timer())
	}
}
impl MetricsTrait for Metrics {
	/// Create all bitfield distribution collectors and register them with `registry`.
	///
	/// The first collector that fails to be created or registered aborts the registration and
	/// its error is returned.
	fn try_register(registry: &prometheus::Registry) -> Result<Self, prometheus::PrometheusError> {
		let sent_own_availability_bitfields = prometheus::register(
			prometheus::Counter::new(
				"pezkuwi_teyrchain_sent_own_availability_bitfields_total",
				"Number of own availability bitfields sent to other peers.",
			)?,
			registry,
		)?;

		let received_availability_bitfields = prometheus::register(
			prometheus::Counter::new(
				"pezkuwi_teyrchain_received_availability_bitfields_total",
				"Number of valid availability bitfields received from other peers.",
			)?,
			registry,
		)?;

		let active_leaves_update = prometheus::register(
			prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
				"pezkuwi_teyrchain_bitfield_distribution_active_leaves_update",
				"Time spent within `bitfield_distribution::active_leaves_update`",
			))?,
			registry,
		)?;

		let handle_bitfield_distribution = prometheus::register(
			prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
				"pezkuwi_teyrchain_bitfield_distribution_handle_bitfield_distribution",
				"Time spent within `bitfield_distribution::handle_bitfield_distribution`",
			))?,
			registry,
		)?;

		let handle_network_msg = prometheus::register(
			prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
				"pezkuwi_teyrchain_bitfield_distribution_handle_network_msg",
				"Time spent within `bitfield_distribution::handle_network_msg`",
			))?,
			registry,
		)?;

		Ok(Metrics(Some(MetricsInner {
			sent_own_availability_bitfields,
			received_availability_bitfields,
			active_leaves_update,
			handle_bitfield_distribution,
			handle_network_msg,
		})))
	}
}
File diff suppressed because it is too large Load Diff
+55
View File
@@ -0,0 +1,55 @@
[package]
name = "pezkuwi-network-bridge"
version = "7.0.0"
description = "The Network Bridge Subsystem — protocol multiplexer for Pezkuwi."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
always-assert = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true, default-features = true }
codec = { features = ["derive"], workspace = true }
fatality = { workspace = true }
futures = { workspace = true }
gum = { workspace = true, default-features = true }
parking_lot = { workspace = true, default-features = true }
pezkuwi-node-metrics = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-overseer = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
sp-consensus = { workspace = true, default-features = true }
thiserror = { workspace = true }
[dev-dependencies]
assert_matches = { workspace = true }
futures-timer = { workspace = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives-test-helpers = { workspace = true }
sp-core = { workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-metrics/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-overseer/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-consensus/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
+36
View File
@@ -0,0 +1,36 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem::SubsystemError;
pub(crate) use pezkuwi_overseer::OverseerError;
/// Errors of the network bridge subsystem.
///
/// Both variants are marked `#[fatal]`.
#[fatality::fatality(splitable)]
pub(crate) enum Error {
/// An error received from the overseer, wrapped as a `SubsystemError`.
#[fatal]
#[error(transparent)]
SubsystemError(#[from] SubsystemError),
/// The stream of incoming events concluded.
#[fatal]
#[error("Event stream closed unexpectedly")]
EventStreamConcluded,
}
impl From<OverseerError> for Error {
fn from(e: OverseerError) -> Self {
Error::SubsystemError(SubsystemError::from(e))
}
}
+132
View File
@@ -0,0 +1,132 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! The Network Bridge Subsystem - protocol multiplexer for Pezkuwi.
//!
//! Split into incoming (`..In`) and outgoing (`..Out`) subsystems.
#![deny(unused_crate_dependencies)]
#![warn(missing_docs)]
use codec::{Decode, Encode};
use futures::prelude::*;
use parking_lot::Mutex;
use sp_consensus::SyncOracle;
use pezkuwi_node_network_protocol::{
peer_set::{PeerSet, ProtocolVersion},
PeerId, UnifiedReputationChange as Rep, View,
};
/// Peer set info for network initialization.
///
/// To be passed to [`FullNetworkConfiguration::add_notification_protocol`]().
pub use pezkuwi_node_network_protocol::peer_set::{peer_sets_info, IsAuthority};
use std::{collections::HashMap, sync::Arc};
mod validator_discovery;
/// Actual interfacing to the network based on the `Network` trait.
///
/// Defines the `Network` trait with an implementation for an `Arc<NetworkService>`.
mod network;
use self::network::Network;
mod metrics;
pub use self::metrics::Metrics;
mod errors;
pub(crate) use self::errors::Error;
mod tx;
pub use self::tx::*;
mod rx;
pub use self::rx::*;
/// The maximum amount of heads a peer is allowed to have in their view at any time.
///
/// We use the same limit to compute the view sent to peers locally.
pub(crate) const MAX_VIEW_HEADS: usize = 5;
/// Major reputation cost labelled "Malformed Network-bridge message".
pub(crate) const MALFORMED_MESSAGE_COST: Rep = Rep::CostMajor("Malformed Network-bridge message");
/// Minor reputation cost labelled "Message sent to un-connected peer-set".
pub(crate) const UNCONNECTED_PEERSET_COST: Rep =
Rep::CostMinor("Message sent to un-connected peer-set");
/// Major reputation cost labelled "Malformed view".
// NOTE(review): presumably applied to views exceeding `MAX_VIEW_HEADS` — confirm at the use site.
pub(crate) const MALFORMED_VIEW_COST: Rep = Rep::CostMajor("Malformed view");
/// Major reputation cost labelled "Peer sent us an empty view".
pub(crate) const EMPTY_VIEW_COST: Rep = Rep::CostMajor("Peer sent us an empty view");
/// Messages from and to the network.
///
/// As transmitted to and received from subsystems. Generic over the protocol message type `M`.
/// The codec indices of the variants are fixed explicitly (`1` and `2`).
#[derive(Debug, Encode, Decode, Clone)]
pub(crate) enum WireMessage<M> {
/// A message from a peer on a specific protocol.
#[codec(index = 1)]
ProtocolMessage(M),
/// A view update from a peer.
#[codec(index = 2)]
ViewUpdate(View),
}
/// Per-peer data kept by the network bridge.
#[derive(Debug)]
pub(crate) struct PeerData {
/// The latest view sent by the peer.
view: View,
/// The protocol version associated with the peer.
version: ProtocolVersion,
}
/// Shared state between incoming and outgoing.
///
/// A cloneable handle: clones share the same mutex-protected `SharedInner` through an `Arc`.
#[derive(Default, Clone)]
pub(crate) struct Shared(Arc<Mutex<SharedInner>>);
/// The data protected by the mutex inside [`Shared`].
#[derive(Default)]
struct SharedInner {
/// Our own view, if one is set (`None` by default).
local_view: Option<View>,
/// Data of the peers on the validation peer set, keyed by peer id.
validation_peers: HashMap<PeerId, PeerData>,
/// Data of the peers on the collation peer set, keyed by peer id.
collation_peers: HashMap<PeerId, PeerData>,
}
// Counts, per protocol version, the number of peers that are connected using that version.
fn count_peers_by_version(peers: &HashMap<PeerId, PeerData>) -> HashMap<ProtocolVersion, usize> {
	peers
		.values()
		.fold(HashMap::<ProtocolVersion, usize>::new(), |mut tally, peer_data| {
			*tally.entry(peer_data.version).or_default() += 1;
			tally
		})
}
// Reports the current number of peers per protocol version, for both the validation and the
// collation peer set, to the metrics. The shared state stays locked for the whole call.
fn note_peers_count(metrics: &Metrics, shared: &Shared) {
	let guard = shared.0.lock();
	let validation_stats = count_peers_by_version(&guard.validation_peers);
	let collation_stats = count_peers_by_version(&guard.collation_peers);

	validation_stats
		.into_iter()
		.for_each(|(version, count)| metrics.note_peer_count(PeerSet::Validation, version, count));

	collation_stats
		.into_iter()
		.for_each(|(version, count)| metrics.note_peer_count(PeerSet::Collation, version, count));
}
/// Operating mode of the network bridge.
// NOTE(review): how the two modes are used is not visible in this file — confirm against `rx`.
pub(crate) enum Mode {
/// Syncing; carries the sync oracle.
Syncing(Box<dyn SyncOracle + Send>),
/// Active.
Active,
}
+287
View File
@@ -0,0 +1,287 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use super::{PeerSet, ProtocolVersion};
use pezkuwi_node_metrics::metrics::{self, prometheus};
/// Metrics for the network bridge.
///
/// Wraps an optional set of collectors: the `Default` value holds `None`, in which case the
/// recording methods do nothing.
#[derive(Clone, Default)]
pub struct Metrics(pub(crate) Option<MetricsInner>);
/// Resolve the metrics label for `peer_set` at protocol `version`.
///
/// Falls back to `"<internal error>"` when the peer set has no label for that version.
fn peer_set_label(peer_set: PeerSet, version: ProtocolVersion) -> &'static str {
	match peer_set.get_protocol_label(version) {
		Some(label) => label,
		// Higher level code is meant to protect against this ever happening.
		None => "<internal error>",
	}
}
#[allow(missing_docs)]
impl Metrics {
	/// Count a peer connect event for the given peer set and protocol version.
	pub fn on_peer_connected(&self, peer_set: PeerSet, version: ProtocolVersion) {
		if let Some(metrics) = self.0.as_ref() {
			metrics
				.connected_events
				.with_label_values(&[peer_set_label(peer_set, version)])
				.inc();
		}
	}

	/// Count a peer disconnect event for the given peer set and protocol version.
	pub fn on_peer_disconnected(&self, peer_set: PeerSet, version: ProtocolVersion) {
		if let Some(metrics) = self.0.as_ref() {
			metrics
				.disconnected_events
				.with_label_values(&[peer_set_label(peer_set, version)])
				.inc();
		}
	}

	/// Set the peer count gauge to `count` and record `count` in the connectivity histogram.
	pub fn note_peer_count(&self, peer_set: PeerSet, version: ProtocolVersion, count: usize) {
		if let Some(metrics) = self.0.as_ref() {
			let label = peer_set_label(peer_set, version);
			metrics.peer_count.with_label_values(&[label]).set(count as u64);
			metrics.peer_connectivity.with_label_values(&[label]).observe(count as f64);
		}
	}

	/// Count one received notification of `size` bytes.
	pub fn on_notification_received(
		&self,
		peer_set: PeerSet,
		version: ProtocolVersion,
		size: usize,
	) {
		if let Some(metrics) = self.0.as_ref() {
			let label = peer_set_label(peer_set, version);
			metrics.notifications_received.with_label_values(&[label]).inc();
			metrics.bytes_received.with_label_values(&[label]).inc_by(size as u64);
		}
	}

	/// Count a notification of `size` bytes sent to `to_peers` peers: the notification counter
	/// grows by `to_peers` and the byte counter by `size * to_peers`.
	pub fn on_notification_sent(
		&self,
		peer_set: PeerSet,
		version: ProtocolVersion,
		size: usize,
		to_peers: usize,
	) {
		if let Some(metrics) = self.0.as_ref() {
			let label = peer_set_label(peer_set, version);
			metrics.notifications_sent.with_label_values(&[label]).inc_by(to_peers as u64);
			metrics.bytes_sent.with_label_values(&[label]).inc_by((size * to_peers) as u64);
		}
	}

	/// Set the desired peer count gauge of `peer_set` to `size`.
	pub fn note_desired_peer_count(&self, peer_set: PeerSet, size: usize) {
		if let Some(metrics) = self.0.as_ref() {
			metrics
				.desired_peer_count
				.with_label_values(&[peer_set.get_label()])
				.set(size as u64);
		}
	}

	/// Count a reputation report; it is also counted as a sent message of type `report_peer`.
	pub fn on_report_event(&self) {
		if let Some(metrics) = self.0.as_ref() {
			self.on_message("report_peer");
			metrics.report_events.inc()
		}
	}

	/// Count one sent message of the given type.
	pub fn on_message(&self, message_type: &'static str) {
		if let Some(metrics) = self.0.as_ref() {
			metrics.messages_sent.with_label_values(&[message_type]).inc()
		}
	}

	/// Record the size of the delayed rx queue.
	pub fn on_delayed_rx_queue(&self, queue_size: usize) {
		if let Some(metrics) = self.0.as_ref() {
			metrics.rx_delayed_processing.observe(queue_size as f64);
		}
	}

	/// Provide a timer for the delayed rx events histogram, if metrics are enabled.
	pub fn time_delayed_rx_events(
		&self,
	) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
		self.0.as_ref().map(|metrics| metrics.rx_delayed_processing_time.start_timer())
	}
}
/// The Prometheus collectors backing the network bridge [`Metrics`].
#[derive(Clone)]
pub(crate) struct MetricsInner {
/// The number of peers on a teyrchain-related peer-set, labelled by protocol.
peer_count: prometheus::GaugeVec<prometheus::U64>,
/// Histogram of peer counts on a teyrchain-related peer-set, labelled by protocol.
peer_connectivity: prometheus::HistogramVec,
/// The number of peer connect events, labelled by protocol.
connected_events: prometheus::CounterVec<prometheus::U64>,
/// The number of peer disconnect events, labelled by protocol.
disconnected_events: prometheus::CounterVec<prometheus::U64>,
/// The number of peers the local node is expected to connect to, labelled by protocol.
desired_peer_count: prometheus::GaugeVec<prometheus::U64>,
/// The amount of reputation changes issued by subsystems.
report_events: prometheus::Counter<prometheus::U64>,
/// The number of notifications received, labelled by protocol.
notifications_received: prometheus::CounterVec<prometheus::U64>,
/// The number of notifications sent, labelled by protocol.
notifications_sent: prometheus::CounterVec<prometheus::U64>,
/// The number of notification bytes received, labelled by protocol.
bytes_received: prometheus::CounterVec<prometheus::U64>,
/// The number of notification bytes sent, labelled by protocol.
bytes_sent: prometheus::CounterVec<prometheus::U64>,
/// The number of messages sent via the network bridge, labelled by message type.
messages_sent: prometheus::CounterVec<prometheus::U64>,
// The reason why a `Histogram` is used to track a queue size is that
// we need not only an average size of the queue (that will be 0 normally), but
// we also need a dynamics for this queue size in case of messages delays.
/// Number of events being delayed while broadcasting from the network bridge.
rx_delayed_processing: prometheus::Histogram,
/// Time spent waiting for the delayed events.
rx_delayed_processing_time: prometheus::Histogram,
}
impl metrics::Metrics for Metrics {
	/// Create all network bridge collectors and register them with `registry`.
	///
	/// Collectors are created and registered one after another; the first failure aborts the
	/// registration and its error is returned.
	fn try_register(
		registry: &prometheus::Registry,
	) -> std::result::Result<Self, prometheus::PrometheusError> {
		let peer_count = prometheus::register(
			prometheus::GaugeVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_peer_count",
					"The number of peers on a teyrchain-related peer-set",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let peer_connectivity = prometheus::register(
			prometheus::HistogramVec::new(
				prometheus::HistogramOpts::new(
					"pezkuwi_teyrchain_peer_connectivity",
					"Histogram of peer counts on a teyrchain-related peer-set to track connectivity patterns",
				).buckets(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 40.0, 50.0, 100.0, 250.0, 500.0, 1000.0]),
				&["protocol"]
			)?,
			registry,
		)?;

		let connected_events = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_peer_connect_events_total",
					"The number of peer connect events on a teyrchain notifications protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let disconnected_events = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_peer_disconnect_events_total",
					"The number of peer disconnect events on a teyrchain notifications protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let desired_peer_count = prometheus::register(
			prometheus::GaugeVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_desired_peer_count",
					"The number of peers that the local node is expected to connect to on a teyrchain-related peer-set (either including or not including unresolvable authorities, depending on whether `ConnectToValidators` or `ConnectToValidatorsResolved` was used.)",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let report_events = prometheus::register(
			prometheus::Counter::new(
				"pezkuwi_teyrchain_network_report_events_total",
				"The amount of reputation changes issued by subsystems",
			)?,
			registry,
		)?;

		let notifications_received = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_notifications_received_total",
					"The number of notifications received on a teyrchain protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let notifications_sent = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_notifications_sent_total",
					"The number of notifications sent on a teyrchain protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let bytes_received = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_notification_bytes_received_total",
					"The number of bytes received on a teyrchain notification protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let bytes_sent = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_notification_bytes_sent_total",
					"The number of bytes sent on a teyrchain notification protocol",
				),
				&["protocol"]
			)?,
			registry,
		)?;

		let messages_sent = prometheus::register(
			prometheus::CounterVec::new(
				prometheus::Opts::new(
					"pezkuwi_teyrchain_messages_sent_total",
					"The number of messages sent via network bridge",
				),
				&["type"]
			)?,
			registry,
		)?;

		let rx_delayed_processing = prometheus::register(
			prometheus::Histogram::with_opts(
				prometheus::HistogramOpts::new(
					"pezkuwi_teyrchain_network_bridge_rx_delayed",
					"Number of events being delayed while broadcasting from the network bridge",
				).buckets(vec![0.0, 1.0, 2.0, 8.0, 16.0]),
			)?,
			registry,
		)?;

		let rx_delayed_processing_time = prometheus::register(
			prometheus::Histogram::with_opts(
				prometheus::HistogramOpts::new(
					"pezkuwi_teyrchain_network_bridge_rx_delayed_time",
					"Time spent for waiting of the delayed events",
				),
			)?,
			registry,
		)?;

		Ok(Metrics(Some(MetricsInner {
			peer_count,
			peer_connectivity,
			connected_events,
			disconnected_events,
			desired_peer_count,
			report_events,
			notifications_received,
			notifications_sent,
			bytes_received,
			bytes_sent,
			messages_sent,
			rx_delayed_processing,
			rx_delayed_processing_time,
		})))
	}
}
+329
View File
@@ -0,0 +1,329 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
collections::{HashMap, HashSet},
sync::Arc,
};
use async_trait::async_trait;
use parking_lot::Mutex;
use codec::Encode;
use sc_network::{
config::parse_addr, multiaddr::Multiaddr, service::traits::NetworkService, types::ProtocolName,
IfDisconnected, MessageSink, OutboundFailure, ReputationChange, RequestFailure,
};
use pezkuwi_node_network_protocol::{
peer_set::{CollationVersion, PeerSet, ProtocolVersion, ValidationVersion},
request_response::{OutgoingRequest, Recipient, ReqProtocolNames, Requests},
v1 as protocol_v1, v2 as protocol_v2, v3 as protocol_v3, PeerId,
};
use pezkuwi_primitives::AuthorityDiscoveryId;
use crate::{metrics::Metrics, validator_discovery::AuthorityDiscovery, WireMessage};
// Log target of the network bridge's network abstraction.
const LOG_TARGET: &str = "teyrchain::network-bridge-net";
// Helper function to send a validation v3 message to a list of peers.
// Messages are always sent via the main protocol, even legacy protocol messages.
pub(crate) fn send_validation_message_v3(
peers: Vec<PeerId>,
message: WireMessage<protocol_v3::ValidationProtocol>,
metrics: &Metrics,
notification_sinks: &Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) {
gum::trace!(target: LOG_TARGET, ?peers, ?message, "Sending validation v3 message to peers",);
send_message(
peers,
PeerSet::Validation,
ValidationVersion::V3.into(),
message,
metrics,
notification_sinks,
);
}
// Helper function to send a collation v1 message to a list of peers.
// Messages are always sent via the main protocol, even legacy protocol messages.
pub(crate) fn send_collation_message_v1(
peers: Vec<PeerId>,
message: WireMessage<protocol_v1::CollationProtocol>,
metrics: &Metrics,
notification_sinks: &Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) {
send_message(
peers,
PeerSet::Collation,
CollationVersion::V1.into(),
message,
metrics,
notification_sinks,
);
}
// Helper function to send a collation v2 message to a list of peers.
// Messages are always sent via the main protocol, even legacy protocol messages.
pub(crate) fn send_collation_message_v2(
peers: Vec<PeerId>,
message: WireMessage<protocol_v2::CollationProtocol>,
metrics: &Metrics,
notification_sinks: &Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) {
send_message(
peers,
PeerSet::Collation,
CollationVersion::V2.into(),
message,
metrics,
notification_sinks,
);
}
/// Lower level function that sends a message to the network using the main protocol version.
///
/// This function is only used internally by the network-bridge, which is responsible to only send
/// messages that are compatible with the passed peer set, as that is currently not enforced by
/// this function. These are messages of type `WireMessage` parameterized on the matching type.
///
/// The message is encoded once, the send metrics are updated, and the encoded bytes are handed
/// to the notification sink of every peer in `peers` that still has one for `peer_set`. Peers
/// without a sink are skipped silently. Nothing happens when `peers` is empty.
fn send_message<M>(
	mut peers: Vec<PeerId>,
	peer_set: PeerSet,
	version: ProtocolVersion,
	message: M,
	metrics: &super::Metrics,
	network_notification_sinks: &Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) where
	M: Encode + Clone,
{
	if peers.is_empty() {
		return;
	}

	// From here on `message` is the encoded payload.
	let message = {
		let encoded = message.encode();
		metrics.on_notification_sent(peer_set, version, encoded.len(), peers.len());
		metrics.on_message(std::any::type_name::<M>());
		encoded
	};

	let notification_sinks = network_notification_sinks.lock();

	gum::trace!(
		target: LOG_TARGET,
		?peers,
		?peer_set,
		?version,
		?message,
		"Sending message to peers",
	);

	// optimization: avoid cloning the message for the last peer in the
	// list. The message payload can be quite large. If the underlying
	// network used `Bytes` this would not be necessary.
	//
	// peer may have gotten disconnect by the time `send_message()` is called
	// at which point the sink is not available.
	let last_peer = peers.pop();
	peers.into_iter().for_each(|peer| {
		if let Some(sink) = notification_sinks.get(&(peer_set, peer)) {
			sink.send_sync_notification(message.clone());
		}
	});

	if let Some(peer) = last_peer {
		if let Some(sink) = notification_sinks.get(&(peer_set, peer)) {
			// Last use of the payload: move it into the sink instead of cloning it.
			sink.send_sync_notification(message);
		}
	}
}
/// An abstraction over networking for the purposes of this subsystem.
///
/// Implementors must be `Clone + Send + 'static`. The peer-set management and request methods
/// are `async` (via `async_trait`); reputation reporting, disconnecting and role lookup are
/// synchronous.
#[async_trait]
pub trait Network: Clone + Send + 'static {
/// Ask the network to keep a substream open with these nodes and not disconnect from them
/// until removed from the protocol's peer set.
/// Note that `out_peers` setting has no effect on this.
///
/// Returns `Err` with a textual description when the request is rejected.
async fn set_reserved_peers(
&mut self,
protocol: ProtocolName,
multiaddresses: HashSet<Multiaddr>,
) -> Result<(), String>;
/// Ask the network to extend the reserved set with these nodes.
///
/// Returns `Err` with a textual description when the request is rejected.
async fn add_peers_to_reserved_set(
&mut self,
protocol: ProtocolName,
multiaddresses: HashSet<Multiaddr>,
) -> Result<(), String>;
/// Removes the peers for the protocol's peer set (both reserved and non-reserved).
///
/// NOTE(review): the `Arc<dyn NetworkService>` implementation in this file forwards to
/// `remove_peers_from_reserved_set`; confirm that the "non-reserved" part of this
/// description still holds.
async fn remove_from_peers_set(
&mut self,
protocol: ProtocolName,
peers: Vec<PeerId>,
) -> Result<(), String>;
/// Send a request to a remote peer.
///
/// `authority_discovery` is available to resolve a recipient given as an authority id,
/// `req_protocol_names` maps the request's protocol to its on-the-wire name and
/// `if_disconnected` is passed through to the underlying network.
async fn start_request<AD: AuthorityDiscovery>(
&self,
authority_discovery: &mut AD,
req: Requests,
req_protocol_names: &ReqProtocolNames,
if_disconnected: IfDisconnected,
);
/// Report a given peer as either beneficial (+) or costly (-) according to the given scalar.
fn report_peer(&self, who: PeerId, rep: ReputationChange);
/// Disconnect a given peer from the protocol specified without harming reputation.
fn disconnect_peer(&self, who: PeerId, protocol: ProtocolName);
/// Get peer role.
///
/// Returns `None` when no role can be determined for `who` from the given `handshake`.
fn peer_role(&self, who: PeerId, handshake: Vec<u8>) -> Option<sc_network::ObservedRole>;
}
/// [`Network`] implementation that forwards every call to the wrapped `NetworkService`.
///
/// The fully qualified `<dyn NetworkService>::method(&**self, ..)` form is used throughout
/// because several methods share their name with the methods of [`Network`] itself.
#[async_trait]
impl Network for Arc<dyn NetworkService> {
    async fn set_reserved_peers(
        &mut self,
        protocol: ProtocolName,
        multiaddresses: HashSet<Multiaddr>,
    ) -> Result<(), String> {
        <dyn NetworkService>::set_reserved_peers(&**self, protocol, multiaddresses)
    }

    async fn add_peers_to_reserved_set(
        &mut self,
        protocol: ProtocolName,
        multiaddresses: HashSet<Multiaddr>,
    ) -> Result<(), String> {
        <dyn NetworkService>::add_peers_to_reserved_set(&**self, protocol, multiaddresses)
    }

    async fn remove_from_peers_set(
        &mut self,
        protocol: ProtocolName,
        peers: Vec<PeerId>,
    ) -> Result<(), String> {
        <dyn NetworkService>::remove_peers_from_reserved_set(&**self, protocol, peers)
    }

    fn report_peer(&self, who: PeerId, rep: ReputationChange) {
        <dyn NetworkService>::report_peer(&**self, who, rep);
    }

    fn disconnect_peer(&self, who: PeerId, protocol: ProtocolName) {
        <dyn NetworkService>::disconnect_peer(&**self, who, protocol);
    }

    /// Encodes `req`, resolves its recipient to a peer id and starts the request.
    ///
    /// When the recipient is an authority, every address known for it is registered with the
    /// network and the peer id of the last parsable address is used. If no peer id can be
    /// determined, the pending response is answered with a dial failure instead.
    async fn start_request<AD: AuthorityDiscovery>(
        &self,
        authority_discovery: &mut AD,
        req: Requests,
        req_protocol_names: &ReqProtocolNames,
        if_disconnected: IfDisconnected,
    ) {
        let (protocol, OutgoingRequest { peer, payload, pending_response, fallback_request }) =
            req.encode_request();

        let resolved = match peer {
            Recipient::Peer(peer_id) => Some(peer_id),
            Recipient::Authority(authority) => {
                gum::trace!(
                    target: LOG_TARGET,
                    ?authority,
                    "Searching for peer id to connect to authority",
                );

                // Note: `get_addresses_by_authority_id` searches in a cache and is thus
                // expected to be very quick.
                let addresses =
                    authority_discovery.get_addresses_by_authority_id(authority).await;

                let mut last_parsed = None;
                for addr in addresses.into_iter().flatten() {
                    // Addresses that cannot be parsed are skipped.
                    if let Ok((peer_id, addr)) = parse_addr(addr) {
                        <dyn NetworkService>::add_known_address(&**self, peer_id, addr);
                        last_parsed = Some(peer_id);
                    }
                }
                last_parsed
            },
        };

        let peer_id = match resolved {
            Some(peer_id) => peer_id,
            None => {
                gum::debug!(target: LOG_TARGET, "Discovering authority failed");
                let delivered = pending_response
                    .send(Err(RequestFailure::Network(OutboundFailure::DialFailure)));
                if delivered.is_err() {
                    gum::debug!(target: LOG_TARGET, "Sending failed request response failed.")
                }
                return;
            },
        };

        gum::trace!(
            target: LOG_TARGET,
            %peer_id,
            protocol = %req_protocol_names.get_name(protocol),
            fallback_protocol = ?fallback_request.as_ref().map(|(_, p)| req_protocol_names.get_name(*p)),
            ?if_disconnected,
            "Starting request",
        );

        <dyn NetworkService>::start_request(
            &**self,
            peer_id,
            req_protocol_names.get_name(protocol),
            payload,
            fallback_request.map(|(r, p)| (r, req_protocol_names.get_name(p))),
            pending_response,
            if_disconnected,
        );
    }

    fn peer_role(&self, who: PeerId, handshake: Vec<u8>) -> Option<sc_network::ObservedRole> {
        <dyn NetworkService>::peer_role(&**self, who, handshake)
    }
}
/// Looks up the peer id belonging to `authority`.
///
/// We assume one `peer_id` per `authority_id`: the peer id of the first address that parses
/// successfully is returned. Yields `None` when the authority is unknown or none of its
/// addresses can be parsed.
pub async fn get_peer_id_by_authority_id<AD: AuthorityDiscovery>(
    authority_discovery: &mut AD,
    authority: AuthorityDiscoveryId,
) -> Option<PeerId> {
    // Note: `get_addresses_by_authority_id` searches in a cache and is thus expected
    // to be very quick.
    let addresses = authority_discovery.get_addresses_by_authority_id(authority).await?;
    for addr in addresses {
        if let Ok((peer_id, _)) = parse_addr(addr) {
            return Some(peer_id);
        }
    }
    None
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+400
View File
@@ -0,0 +1,400 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! The Network Bridge Subsystem - handles _outgoing_ messages, from subsystem to the network.
use super::*;
use pezkuwi_node_network_protocol::{
peer_set::PeerSetProtocolNames, request_response::ReqProtocolNames, CollationProtocols,
ValidationProtocols,
};
use pezkuwi_node_subsystem::{
errors::SubsystemError,
messages::{NetworkBridgeTxMessage, ReportPeerMessage},
overseer, FromOrchestra, OverseerSignal, SpawnedSubsystem,
};
use pezkuwi_node_network_protocol::request_response::Requests;
use sc_network::{MessageSink, ReputationChange};
use crate::validator_discovery;
/// Actual interfacing to the network based on the `Network` trait.
///
/// Defines the `Network` trait with an implementation for an `Arc<NetworkService>`.
use crate::network::{
send_collation_message_v1, send_collation_message_v2, send_validation_message_v3, Network,
};
use crate::metrics::Metrics;
#[cfg(test)]
mod tests;
/// Log target used by the outgoing (tx) half of the network bridge.
const LOG_TARGET: &str = "teyrchain::network-bridge-tx";
/// The network bridge subsystem.
///
/// This is the outgoing half: it turns `NetworkBridgeTxMessage`s received from other
/// subsystems into calls on the network service and the per-peer notification sinks.
pub struct NetworkBridgeTx<N, AD> {
/// `Network` trait implementing type.
network_service: N,
/// Authority discovery handle, used when starting requests and when resolving validators
/// for connection requests.
authority_discovery_service: AD,
/// Metrics updated while handling messages.
metrics: Metrics,
/// Names of the request/response protocols, used when starting requests.
req_protocol_names: ReqProtocolNames,
/// Names of the peer-set protocols, used for disconnecting peers and by validator discovery.
peerset_protocol_names: PeerSetProtocolNames,
/// Notification sinks keyed by `(peer set, peer)`; looked up when sending messages.
notification_sinks: Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
}
impl<N, AD> NetworkBridgeTx<N, AD> {
/// Create a new network bridge subsystem with underlying network service and authority
/// discovery service.
///
/// This assumes that the network service has had the notifications protocol for the network
/// bridge already registered. See [`peer_sets_info`].
pub fn new(
network_service: N,
authority_discovery_service: AD,
metrics: Metrics,
req_protocol_names: ReqProtocolNames,
peerset_protocol_names: PeerSetProtocolNames,
notification_sinks: Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) -> Self {
Self {
network_service,
authority_discovery_service,
metrics,
req_protocol_names,
peerset_protocol_names,
notification_sinks,
}
}
}
#[overseer::subsystem(NetworkBridgeTx, error = SubsystemError, prefix = self::overseer)]
impl<Net, AD, Context> NetworkBridgeTx<Net, AD>
where
    Net: Network + Sync,
    AD: validator_discovery::AuthorityDiscovery + Clone + Sync,
{
    /// Wraps the subsystem's main loop into a boxed future and labels it for the overseer.
    ///
    /// Errors coming out of the loop are converted into a `SubsystemError` carrying the
    /// origin string `"network-bridge"`.
    fn start(self, ctx: Context) -> SpawnedSubsystem {
        let main_loop = run_network_out(self, ctx);
        let future =
            main_loop.map_err(|e| SubsystemError::with_origin("network-bridge", e)).boxed();

        SpawnedSubsystem { name: "network-bridge-tx-subsystem", future }
    }
}
/// Main loop of the outgoing bridge.
///
/// Receives messages from the overseer until a `Conclude` signal arrives (returning `Ok(())`)
/// or receiving fails (returning the error). Every subsystem communication is passed to
/// `handle_incoming_subsystem_communication`, which hands the network and authority
/// discovery services back for the next iteration.
#[overseer::contextbounds(NetworkBridgeTx, prefix = self::overseer)]
async fn handle_subsystem_messages<Context, N, AD>(
    mut ctx: Context,
    mut network_service: N,
    mut authority_discovery_service: AD,
    metrics: Metrics,
    req_protocol_names: ReqProtocolNames,
    peerset_protocol_names: PeerSetProtocolNames,
    notification_sinks: Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) -> Result<(), Error>
where
    N: Network,
    AD: validator_discovery::AuthorityDiscovery + Clone,
{
    let mut validator_discovery =
        validator_discovery::Service::<N, AD>::new(peerset_protocol_names.clone());

    loop {
        let msg = match ctx.recv().fuse().await? {
            FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
            // All other signals are handled by the incoming side.
            FromOrchestra::Signal(_) => continue,
            FromOrchestra::Communication { msg } => msg,
        };

        let services = handle_incoming_subsystem_communication(
            &mut ctx,
            network_service,
            &mut validator_discovery,
            authority_discovery_service.clone(),
            msg,
            &metrics,
            &req_protocol_names,
            &peerset_protocol_names,
            &notification_sinks,
        )
        .await;
        (network_service, authority_discovery_service) = services;
    }
}
#[overseer::contextbounds(NetworkBridgeTx, prefix = self::overseer)]
async fn handle_incoming_subsystem_communication<Context, N, AD>(
_ctx: &mut Context,
network_service: N,
validator_discovery: &mut validator_discovery::Service<N, AD>,
mut authority_discovery_service: AD,
msg: NetworkBridgeTxMessage,
metrics: &Metrics,
req_protocol_names: &ReqProtocolNames,
peerset_protocol_names: &PeerSetProtocolNames,
notification_sinks: &Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) -> (N, AD)
where
N: Network,
AD: validator_discovery::AuthorityDiscovery + Clone,
{
match msg {
NetworkBridgeTxMessage::ReportPeer(ReportPeerMessage::Single(peer, rep)) => {
if !rep.value.is_positive() {
gum::debug!(target: LOG_TARGET, ?peer, ?rep, action = "ReportPeer");
}
metrics.on_report_event();
network_service.report_peer(peer, rep);
},
NetworkBridgeTxMessage::ReportPeer(ReportPeerMessage::Batch(batch)) => {
for (peer, score) in batch {
let rep = ReputationChange::new(score, "Aggregated reputation change");
if !rep.value.is_positive() {
gum::debug!(target: LOG_TARGET, ?peer, ?rep, action = "ReportPeer");
}
metrics.on_report_event();
network_service.report_peer(peer, rep);
}
},
NetworkBridgeTxMessage::DisconnectPeers(peers, peer_set) => {
gum::trace!(
target: LOG_TARGET,
action = "DisconnectPeers",
?peers,
peer_set = ?peer_set,
);
// [`NetworkService`] keeps track of the protocols by their main name.
let protocol = peerset_protocol_names.get_main_name(peer_set);
for peer in peers {
network_service.disconnect_peer(peer, protocol.clone());
}
},
NetworkBridgeTxMessage::SendValidationMessage(peers, msg) => {
gum::trace!(
target: LOG_TARGET,
action = "SendValidationMessages",
?msg,
num_messages = 1usize,
);
match msg {
ValidationProtocols::V3(msg) => send_validation_message_v3(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
}
},
NetworkBridgeTxMessage::SendValidationMessages(msgs) => {
gum::trace!(
target: LOG_TARGET,
action = "SendValidationMessages",
num_messages = %msgs.len(),
?msgs,
);
for (peers, msg) in msgs {
match msg {
ValidationProtocols::V3(msg) => send_validation_message_v3(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
}
}
},
NetworkBridgeTxMessage::SendCollationMessage(peers, msg) => {
gum::trace!(
target: LOG_TARGET,
action = "SendCollationMessages",
num_messages = 1usize,
);
match msg {
CollationProtocols::V1(msg) => send_collation_message_v1(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
CollationProtocols::V2(msg) => send_collation_message_v2(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
}
},
NetworkBridgeTxMessage::SendCollationMessages(msgs) => {
gum::trace!(
target: LOG_TARGET,
action = "SendCollationMessages",
num_messages = %msgs.len(),
);
for (peers, msg) in msgs {
match msg {
CollationProtocols::V1(msg) => send_collation_message_v1(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
CollationProtocols::V2(msg) => send_collation_message_v2(
peers,
WireMessage::ProtocolMessage(msg),
&metrics,
notification_sinks,
),
}
}
},
NetworkBridgeTxMessage::SendRequests(reqs, if_disconnected) => {
gum::trace!(
target: LOG_TARGET,
action = "SendRequests",
num_requests = %reqs.len(),
);
for req in reqs {
match req {
Requests::ChunkFetching(ref req) => {
// This is not the actual request that will succeed, as we don't know yet
// what that will be. It's only the primary request we tried.
if req.fallback_request.is_some() {
metrics.on_message("chunk_fetching_v2")
} else {
metrics.on_message("chunk_fetching_v1")
}
},
Requests::AvailableDataFetchingV1(_) =>
metrics.on_message("available_data_fetching_v1"),
Requests::CollationFetchingV1(_) => metrics.on_message("collation_fetching_v1"),
Requests::CollationFetchingV2(_) => metrics.on_message("collation_fetching_v2"),
Requests::PoVFetchingV1(_) => metrics.on_message("pov_fetching_v1"),
Requests::DisputeSendingV1(_) => metrics.on_message("dispute_sending_v1"),
Requests::AttestedCandidateV2(_) => metrics.on_message("attested_candidate_v2"),
}
network_service
.start_request(
&mut authority_discovery_service,
req,
req_protocol_names,
if_disconnected,
)
.await;
}
},
NetworkBridgeTxMessage::ConnectToValidators { validator_ids, peer_set, failed } => {
gum::trace!(
target: LOG_TARGET,
action = "ConnectToValidators",
peer_set = ?peer_set,
ids = ?validator_ids,
"Received a validator connection request",
);
metrics.note_desired_peer_count(peer_set, validator_ids.len());
let (network_service, ads) = validator_discovery
.on_request(
validator_ids,
peer_set,
failed,
network_service,
authority_discovery_service,
)
.await;
return (network_service, ads);
},
NetworkBridgeTxMessage::ConnectToResolvedValidators { validator_addrs, peer_set } => {
gum::trace!(
target: LOG_TARGET,
action = "ConnectToPeers",
peer_set = ?peer_set,
?validator_addrs,
"Received a resolved validator connection request",
);
metrics.note_desired_peer_count(peer_set, validator_addrs.len());
let all_addrs = validator_addrs.into_iter().flatten().collect();
let network_service = validator_discovery
.on_resolved_request(all_addrs, peer_set, network_service)
.await;
return (network_service, authority_discovery_service);
},
NetworkBridgeTxMessage::AddToResolvedValidators { validator_addrs, peer_set } => {
gum::trace!(
target: LOG_TARGET,
action = "AddToResolvedValidators",
peer_set = ?peer_set,
?validator_addrs,
"Received a resolved validator connection request",
);
let all_addrs = validator_addrs.into_iter().flatten().collect();
let network_service = validator_discovery
.on_add_to_resolved_request(all_addrs, peer_set, network_service)
.await;
return (network_service, authority_discovery_service);
},
}
(network_service, authority_discovery_service)
}
/// Takes the subsystem apart and runs its message loop until it concludes or fails.
#[overseer::contextbounds(NetworkBridgeTx, prefix = self::overseer)]
async fn run_network_out<N, AD, Context>(
    bridge: NetworkBridgeTx<N, AD>,
    ctx: Context,
) -> Result<(), Error>
where
    N: Network,
    AD: validator_discovery::AuthorityDiscovery + Clone + Sync,
{
    handle_subsystem_messages(
        ctx,
        bridge.network_service,
        bridge.authority_discovery_service,
        bridge.metrics,
        bridge.req_protocol_names,
        bridge.peerset_protocol_names,
        bridge.notification_sinks,
    )
    .await
}
+371
View File
@@ -0,0 +1,371 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use super::*;
use futures::executor;
use pezkuwi_node_subsystem_util::TimeoutExt;
use async_trait::async_trait;
use parking_lot::Mutex;
use std::collections::HashSet;
use sc_network::{
IfDisconnected, ObservedRole as SubstrateObservedRole, ProtocolName, ReputationChange, Roles,
};
use codec::DecodeAll;
use pezkuwi_node_network_protocol::{
peer_set::PeerSetProtocolNames,
request_response::{outgoing::Requests, ReqProtocolNames},
v1 as protocol_v1, v3 as protocol_v3, CollationProtocols, ObservedRole, ValidationProtocols,
};
use pezkuwi_node_subsystem::{FromOrchestra, OverseerSignal};
use pezkuwi_node_subsystem_test_helpers::TestSubsystemContextHandle;
use pezkuwi_node_subsystem_util::metered;
use pezkuwi_primitives::{AuthorityDiscoveryId, Hash};
use pezkuwi_primitives_test_helpers::dummy_collator_signature;
use sc_network::Multiaddr;
use sp_keyring::Sr25519Keyring;
// Upper bound applied via `.timeout(TIMEOUT)` to awaited steps in the tests below; reuses
// the `TIMEOUT` constant of the test subsystem context handle.
const TIMEOUT: std::time::Duration = pezkuwi_node_subsystem_test_helpers::TestSubsystemContextHandle::<NetworkBridgeTxMessage>::TIMEOUT;
use crate::{network::Network, validator_discovery::AuthorityDiscovery};
/// An action the subsystem under test performed on the (fake) network, as recorded by
/// `TestNetwork` and `TestMessageSink` and observed through `TestNetworkHandle`.
#[derive(Debug, PartialEq)]
pub enum NetworkAction {
/// Note a change in reputation for a peer.
ReputationChange(PeerId, ReputationChange),
/// Disconnect a peer from the given peer-set.
DisconnectPeer(PeerId, PeerSet),
/// Write a notification to a given peer on the given peer-set.
WriteNotification(PeerId, PeerSet, Vec<u8>),
}
// The subsystem's view of the network.
//
// Implements `Network` by turning reputation changes and disconnects into `NetworkAction`s.
#[derive(Clone)]
struct TestNetwork {
// Sending half of the channel on which performed actions are reported.
action_tx: Arc<Mutex<metered::UnboundedMeteredSender<NetworkAction>>>,
// Used to map a protocol name back to its peer set in `disconnect_peer`.
peerset_protocol_names: Arc<PeerSetProtocolNames>,
}
// Authority discovery stub for these tests; its `AuthorityDiscovery` implementation below
// resolves nothing.
#[derive(Clone, Debug)]
struct TestAuthorityDiscovery;
// The test's view of the network. This receives updates from the subsystem in the form
// of `NetworkAction`s.
struct TestNetworkHandle {
// Receiving half of the action channel.
action_rx: metered::UnboundedMeteredReceiver<NetworkAction>,
// Kept but not read by the handle (hence the leading underscore).
_peerset_protocol_names: PeerSetProtocolNames,
// Sink map shared with the subsystem; `connect_peer` registers sinks here.
notification_sinks: Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
// Sending half of the action channel, handed to every `TestMessageSink` created.
action_tx: Arc<Mutex<metered::UnboundedMeteredSender<NetworkAction>>>,
}
// Notification sink for one `(peer, peer_set)` pair that records every synchronous
// notification as a `NetworkAction::WriteNotification`.
struct TestMessageSink {
// Peer this sink belongs to.
peer: PeerId,
// Peer set this sink belongs to.
peer_set: PeerSet,
// Channel on which written notifications are reported.
action_tx: Arc<Mutex<metered::UnboundedMeteredSender<NetworkAction>>>,
}
impl TestMessageSink {
fn new(
peer: PeerId,
peer_set: PeerSet,
action_tx: Arc<Mutex<metered::UnboundedMeteredSender<NetworkAction>>>,
) -> TestMessageSink {
Self { peer, peer_set, action_tx }
}
}
#[async_trait::async_trait]
impl MessageSink for TestMessageSink {
    /// Records the notification as a `WriteNotification` action for this sink's peer and
    /// peer set. Panics if the action cannot be sent on the channel.
    fn send_sync_notification(&self, notification: Vec<u8>) {
        let action = NetworkAction::WriteNotification(self.peer, self.peer_set, notification);
        let sender = self.action_tx.lock();
        sender.unbounded_send(action).unwrap();
    }

    /// Not implemented for this test sink.
    async fn send_async_notification(
        &self,
        _notification: Vec<u8>,
    ) -> Result<(), sc_network::error::Error> {
        unimplemented!();
    }
}
/// Builds the pieces of the fake network used by the tests.
///
/// Returns the `TestNetwork` handed to the subsystem, the `TestNetworkHandle` kept by the
/// test, an authority discovery stub and the notification sink map shared between the
/// subsystem and the handle.
fn new_test_network(
    peerset_protocol_names: PeerSetProtocolNames,
) -> (
    TestNetwork,
    TestNetworkHandle,
    TestAuthorityDiscovery,
    Arc<Mutex<HashMap<(PeerSet, PeerId), Box<dyn MessageSink>>>>,
) {
    let (action_tx, action_rx) = metered::unbounded();
    let notification_sinks = Arc::new(Mutex::new(HashMap::new()));
    let action_tx = Arc::new(Mutex::new(action_tx));

    let network = TestNetwork {
        action_tx: action_tx.clone(),
        peerset_protocol_names: Arc::new(peerset_protocol_names.clone()),
    };
    let handle = TestNetworkHandle {
        action_rx,
        _peerset_protocol_names: peerset_protocol_names,
        action_tx,
        notification_sinks: notification_sinks.clone(),
    };

    (network, handle, TestAuthorityDiscovery, notification_sinks)
}
/// Fake [`Network`]: peer-set management and requests are no-ops, while reputation changes
/// and disconnects are reported as `NetworkAction`s on the action channel.
#[async_trait]
impl Network for TestNetwork {
    async fn set_reserved_peers(
        &mut self,
        _protocol: ProtocolName,
        _: HashSet<Multiaddr>,
    ) -> Result<(), String> {
        Ok(())
    }

    async fn add_peers_to_reserved_set(
        &mut self,
        _protocol: ProtocolName,
        _: HashSet<Multiaddr>,
    ) -> Result<(), String> {
        Ok(())
    }

    async fn remove_from_peers_set(
        &mut self,
        _protocol: ProtocolName,
        _: Vec<PeerId>,
    ) -> Result<(), String> {
        Ok(())
    }

    async fn start_request<AD: AuthorityDiscovery>(
        &self,
        _: &mut AD,
        _: Requests,
        _: &ReqProtocolNames,
        _: IfDisconnected,
    ) {
    }

    fn report_peer(&self, who: PeerId, rep: ReputationChange) {
        self.action_tx
            .lock()
            .unbounded_send(NetworkAction::ReputationChange(who, rep))
            .unwrap();
    }

    /// Maps `protocol` back to its peer set, asserts that it is the peer set's main version
    /// and records the disconnect.
    fn disconnect_peer(&self, who: PeerId, protocol: ProtocolName) {
        let (peer_set, version) = self.peerset_protocol_names.try_get_protocol(&protocol).unwrap();
        assert_eq!(version, peer_set.get_main_version());

        self.action_tx
            .lock()
            .unbounded_send(NetworkAction::DisconnectPeer(who, peer_set))
            .unwrap();
    }

    /// Decodes the handshake as `Roles`; yields `None` when decoding fails.
    fn peer_role(&self, _peer_id: PeerId, handshake: Vec<u8>) -> Option<SubstrateObservedRole> {
        Roles::decode_all(&mut &handshake[..]).ok().map(SubstrateObservedRole::from)
    }
}
/// Authority discovery stub: every lookup comes back empty.
#[async_trait]
impl validator_discovery::AuthorityDiscovery for TestAuthorityDiscovery {
    /// Always `None`: no authority has known addresses.
    async fn get_addresses_by_authority_id(
        &mut self,
        _: AuthorityDiscoveryId,
    ) -> Option<HashSet<Multiaddr>> {
        None
    }

    /// Always `None`: no peer maps to an authority.
    async fn get_authority_ids_by_peer_id(
        &mut self,
        _: PeerId,
    ) -> Option<HashSet<AuthorityDiscoveryId>> {
        None
    }
}
impl TestNetworkHandle {
    /// Waits for the next action reported on the channel.
    ///
    /// Panics with "subsystem concluded early" if the channel yields no further action.
    async fn next_network_action(&mut self) -> NetworkAction {
        let next = self.action_rx.next().await;
        next.expect("subsystem concluded early")
    }

    /// Registers a `TestMessageSink` for `peer` on `peer_set` in the shared sink map.
    /// The role argument is unused.
    async fn connect_peer(&mut self, peer: PeerId, peer_set: PeerSet, _role: ObservedRole) {
        let sink = Box::new(TestMessageSink::new(peer, peer_set, self.action_tx.clone()));
        self.notification_sinks.lock().insert((peer_set, peer), sink);
    }
}
// Handle through which a test plays the overseer towards the subsystem.
type VirtualOverseer = TestSubsystemContextHandle<NetworkBridgeTxMessage>;
// Everything a test body receives from `test_harness`.
struct TestHarness {
// Observes network actions and registers peers.
network_handle: TestNetworkHandle,
// Sends messages and signals to the subsystem under test.
virtual_overseer: VirtualOverseer,
}
/// Runs `test` against a freshly constructed `NetworkBridgeTx` subsystem.
///
/// The subsystem and the test future are driven concurrently on the current thread. Once
/// the test future returns its `VirtualOverseer`, a `Conclude` signal is sent so that the
/// subsystem shuts down. Panics if the subsystem finishes with an error.
fn test_harness<T: Future<Output = VirtualOverseer>>(test: impl FnOnce(TestHarness) -> T) {
    let genesis_hash = Hash::repeat_byte(0xff);
    let fork_id = None;
    let req_protocol_names = ReqProtocolNames::new(genesis_hash, fork_id);
    let peerset_protocol_names = PeerSetProtocolNames::new(genesis_hash, fork_id);

    let pool = sp_core::testing::TaskExecutor::new();
    let (network, network_handle, discovery, network_notification_sinks) =
        new_test_network(peerset_protocol_names.clone());
    let (context, virtual_overseer) =
        pezkuwi_node_subsystem_test_helpers::make_subsystem_context(pool);

    let bridge_out = NetworkBridgeTx::new(
        network,
        discovery,
        Metrics(None),
        req_protocol_names,
        peerset_protocol_names,
        network_notification_sinks,
    );

    // Subsystem side: any error is turned into a panic.
    let subsystem_fut = async move {
        if let Err(e) = run_network_out(bridge_out, context).await {
            panic!("bridge-out subsystem execution failed {:?}", e);
        }
    };

    // Test side: run the test body, then ask the subsystem to conclude.
    let test_fut = test(TestHarness { network_handle, virtual_overseer });
    let driver_fut = async move {
        let mut virtual_overseer = test_fut.await;
        virtual_overseer.send(FromOrchestra::Signal(OverseerSignal::Conclude)).await;
    };

    futures::pin_mut!(driver_fut);
    futures::pin_mut!(subsystem_fut);

    let _ = executor::block_on(future::join(driver_fut, subsystem_fut));
}
/// Checks that validation and collation messages sent through the bridge end up as
/// notifications on the matching peer set of the connected peer.
#[test]
fn send_messages_to_peers() {
    test_harness(|test_harness| async move {
        let TestHarness { mut network_handle, mut virtual_overseer } = test_harness;

        let peer = PeerId::random();

        network_handle
            .connect_peer(peer, PeerSet::Validation, ObservedRole::Full)
            .timeout(TIMEOUT)
            .await
            .expect("Timeout does not occur");

        // The outgoing side does not consume network messages, so the sink for the
        // collation peer set has to be registered explicitly as well.
        network_handle
            .connect_peer(peer, PeerSet::Collation, ObservedRole::Full)
            .timeout(TIMEOUT)
            .await
            .expect("Timeout does not occur");

        // send a validation protocol message.
        {
            let approval_distribution_message =
                protocol_v3::ApprovalDistributionMessage::Approvals(Vec::new());

            let message_v3 = protocol_v3::ValidationProtocol::ApprovalDistribution(
                approval_distribution_message,
            );

            virtual_overseer
                .send(FromOrchestra::Communication {
                    msg: NetworkBridgeTxMessage::SendValidationMessage(
                        vec![peer],
                        ValidationProtocols::V3(message_v3.clone()),
                    ),
                })
                .timeout(TIMEOUT)
                .await
                .expect("Timeout does not occur");

            assert_eq!(
                network_handle
                    .next_network_action()
                    .timeout(TIMEOUT)
                    .await
                    .expect("Timeout does not occur"),
                NetworkAction::WriteNotification(
                    peer,
                    PeerSet::Validation,
                    WireMessage::ProtocolMessage(message_v3).encode(),
                )
            );
        }

        // send a collation protocol message.
        {
            let collator_protocol_message = protocol_v1::CollatorProtocolMessage::Declare(
                Sr25519Keyring::Alice.public().into(),
                0_u32.into(),
                dummy_collator_signature(),
            );

            let message_v1 =
                protocol_v1::CollationProtocol::CollatorProtocol(collator_protocol_message);

            virtual_overseer
                .send(FromOrchestra::Communication {
                    msg: NetworkBridgeTxMessage::SendCollationMessage(
                        vec![peer],
                        CollationProtocols::V1(message_v1.clone()),
                    ),
                })
                .timeout(TIMEOUT)
                .await
                .expect("Timeout does not occur");

            assert_eq!(
                network_handle
                    .next_network_action()
                    .timeout(TIMEOUT)
                    .await
                    .expect("Timeout does not occur"),
                NetworkAction::WriteNotification(
                    peer,
                    PeerSet::Collation,
                    WireMessage::ProtocolMessage(message_v1).encode(),
                )
            );
        }

        virtual_overseer
    });
}
@@ -0,0 +1,413 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! A validator discovery service for the Network Bridge.
use crate::Network;
use core::marker::PhantomData;
use std::collections::HashSet;
use futures::channel::oneshot;
use sc_network::multiaddr::{self, Multiaddr};
pub use pezkuwi_node_network_protocol::authority_discovery::AuthorityDiscovery;
use pezkuwi_node_network_protocol::{
peer_set::{PeerSet, PeerSetProtocolNames, PerPeerSet},
PeerId,
};
use pezkuwi_primitives::AuthorityDiscoveryId;
// Log target used by the validator discovery service.
const LOG_TARGET: &str = "teyrchain::validator-discovery";
/// Validator discovery service: remembers, per peer set, which peers were requested last and
/// forwards connection requests to the network.
pub(super) struct Service<N, AD> {
// Per-peer-set bookkeeping of previously requested peers.
state: PerPeerSet<StatePerPeerSet>,
// Used to look up the main protocol name of a peer set.
peerset_protocol_names: PeerSetProtocolNames,
// PhantomData used to make the struct generic instead of having generic methods
_phantom: PhantomData<(N, AD)>,
}
// Discovery state kept separately for every peer set.
#[derive(Default)]
struct StatePerPeerSet {
// Peer ids extracted from the addresses of the most recent requests for this peer set.
previously_requested: HashSet<PeerId>,
}
impl<N: Network, AD: AuthorityDiscovery> Service<N, AD> {
    /// Creates a service with empty per-peer-set state.
    pub fn new(peerset_protocol_names: PeerSetProtocolNames) -> Self {
        Self { state: Default::default(), peerset_protocol_names, _phantom: PhantomData }
    }

    /// Connect to already resolved addresses.
    ///
    /// Replaces the set of requested peers for `peer_set` with the peers found in
    /// `newly_requested` and asks the network to set exactly these addresses as reserved
    /// peers. A failure reported by the network is logged and otherwise ignored.
    /// `network_service` is taken by value and returned to the caller.
    pub async fn on_resolved_request(
        &mut self,
        newly_requested: HashSet<Multiaddr>,
        peer_set: PeerSet,
        mut network_service: N,
    ) -> N {
        let state = &mut self.state[peer_set];
        let new_peer_ids: HashSet<PeerId> = extract_peer_ids(newly_requested.iter().cloned());
        let num_peers = new_peer_ids.len();

        // Only the number of dropped peers is needed (for logging), so count them instead
        // of collecting them.
        let removed = state.previously_requested.difference(&new_peer_ids).count();
        state.previously_requested = new_peer_ids;

        gum::debug!(
            target: LOG_TARGET,
            ?peer_set,
            ?num_peers,
            ?removed,
            "New ConnectToValidators resolved request",
        );

        // ask the network to connect to these nodes and not disconnect
        // from them until removed from the set
        //
        // for peer-set management, the main protocol name should be used regardless of
        // the negotiated version.
        if let Err(e) = network_service
            .set_reserved_peers(
                self.peerset_protocol_names.get_main_name(peer_set),
                newly_requested,
            )
            .await
        {
            gum::warn!(target: LOG_TARGET, err = ?e, "AuthorityDiscoveryService returned an invalid multiaddress");
        }

        network_service
    }

    /// Add already resolved addresses to the reserved set.
    ///
    /// Unlike [`Self::on_resolved_request`], previously requested peers are kept: the peers
    /// found in `newly_requested` are added to the bookkeeping for `peer_set` and the
    /// network is asked to extend its reserved set with these addresses. A failure reported
    /// by the network is logged and otherwise ignored. `network_service` is taken by value
    /// and returned to the caller.
    pub async fn on_add_to_resolved_request(
        &mut self,
        newly_requested: HashSet<Multiaddr>,
        peer_set: PeerSet,
        mut network_service: N,
    ) -> N {
        let state = &mut self.state[peer_set];
        let new_peer_ids: HashSet<PeerId> = extract_peer_ids(newly_requested.iter().cloned());
        let num_peers = new_peer_ids.len();

        state.previously_requested.extend(new_peer_ids);

        gum::debug!(
            target: LOG_TARGET,
            ?peer_set,
            ?num_peers,
            "New add to resolved validators request",
        );

        // ask the network to connect to these nodes and not disconnect
        // from them until they are removed from the set.
        //
        // for peer-set management, the main protocol name should be used regardless of
        // the negotiated version.
        if let Err(e) = network_service
            .add_peers_to_reserved_set(
                self.peerset_protocol_names.get_main_name(peer_set),
                newly_requested,
            )
            .await
        {
            gum::warn!(target: LOG_TARGET, err = ?e, "AuthorityDiscoveryService returned an invalid multiaddress");
        }

        network_service
    }

    /// On a new connection request, a peer set update will be issued.
    /// It will ask the network to connect to the validators and not disconnect
    /// from them at least until the next request is issued for the same peer set.
    ///
    /// This method will also disconnect from previously connected validators not in the
    /// `validator_ids` set. It takes `network_service` and `authority_discovery_service` by
    /// value and returns them as a workaround for the `Future: Send` requirement imposed by
    /// the async function implementation.
    ///
    /// The number of authorities that could not be resolved is sent on `failed`; a dropped
    /// receiver is ignored.
    pub async fn on_request(
        &mut self,
        validator_ids: Vec<AuthorityDiscoveryId>,
        peer_set: PeerSet,
        failed: oneshot::Sender<usize>,
        network_service: N,
        mut authority_discovery_service: AD,
    ) -> (N, AD) {
        // collect multiaddress of validators
        let mut failed_to_resolve: usize = 0;
        let mut newly_requested = HashSet::new();
        let requested = validator_ids.len();
        for authority in validator_ids.into_iter() {
            let result = authority_discovery_service
                .get_addresses_by_authority_id(authority.clone())
                .await;
            if let Some(addresses) = result {
                newly_requested.extend(addresses);
            } else {
                failed_to_resolve += 1;
                gum::debug!(
                    target: LOG_TARGET,
                    "Authority Discovery couldn't resolve {:?}",
                    authority
                );
            }
        }

        gum::debug!(
            target: LOG_TARGET,
            ?peer_set,
            ?requested,
            ?failed_to_resolve,
            "New ConnectToValidators request",
        );

        let r = self.on_resolved_request(newly_requested, peer_set, network_service).await;

        let _ = failed.send(failed_to_resolve);

        (r, authority_discovery_service)
    }
}
/// Collects the peer ids of all addresses whose last component is a `/p2p/<id>` protocol
/// with a multihash that converts into a `PeerId`. All other addresses are skipped.
fn extract_peer_ids(multiaddr: impl Iterator<Item = Multiaddr>) -> HashSet<PeerId> {
    let mut peer_ids = HashSet::new();
    for mut addr in multiaddr {
        if let Some(multiaddr::Protocol::P2p(key)) = addr.pop() {
            if let Ok(peer_id) = PeerId::from_multihash(key) {
                peer_ids.insert(peer_id);
            }
        }
    }
    peer_ids
}
#[cfg(test)]
mod tests {
// Unit tests for `Service::on_request`, run against an in-memory network mock and an
// in-memory authority discovery mock.
use super::*;
use crate::network::Network;
use async_trait::async_trait;
use pezkuwi_node_network_protocol::{
request_response::{outgoing::Requests, ReqProtocolNames},
PeerId,
};
use pezkuwi_primitives::Hash;
use sc_network::{IfDisconnected, ProtocolName, ReputationChange};
use sp_keyring::Sr25519Keyring;
use std::collections::{HashMap, HashSet};
/// Builds a `Service` whose protocol names are derived from a fixed genesis hash (all bytes
/// `0xff`) and no fork id.
fn new_service() -> Service<TestNetwork, TestAuthorityDiscovery> {
let genesis_hash = Hash::repeat_byte(0xff);
let fork_id = None;
let protocol_names = PeerSetProtocolNames::new(genesis_hash, fork_id);
Service::new(protocol_names)
}
/// Returns a fresh pair of mocks: an empty network and a pre-populated authority discovery.
fn new_network() -> (TestNetwork, TestAuthorityDiscovery) {
(TestNetwork::default(), TestAuthorityDiscovery::new())
}
/// Network mock that only records the peer ids of its reserved peers.
#[derive(Default, Clone)]
struct TestNetwork {
// Peer ids extracted from the multiaddresses passed to the reserved-set methods.
peers_set: HashSet<PeerId>,
}
/// Authority discovery mock backed by two fixed lookup tables.
#[derive(Default, Clone, Debug)]
struct TestAuthorityDiscovery {
// Authority id -> addresses of that authority.
by_authority_id: HashMap<AuthorityDiscoveryId, HashSet<Multiaddr>>,
// Peer id -> authority ids behind that peer.
by_peer_id: HashMap<PeerId, HashSet<AuthorityDiscoveryId>>,
}
impl TestAuthorityDiscovery {
/// Creates the mock with the i-th known authority mapped to the i-th known multiaddress
/// (with the i-th peer id appended as a `P2p` component) and to the i-th peer id.
fn new() -> Self {
let peer_ids = known_peer_ids();
let authorities = known_authorities();
// Append each peer id to its address so that `extract_peer_ids` can recover it later.
let multiaddr = known_multiaddr().into_iter().zip(peer_ids.iter().cloned()).map(
|(mut addr, peer_id)| {
addr.push(multiaddr::Protocol::P2p(peer_id.into()));
HashSet::from([addr])
},
);
Self {
by_authority_id: authorities.iter().cloned().zip(multiaddr).collect(),
by_peer_id: peer_ids
.into_iter()
.zip(authorities.into_iter().map(|a| HashSet::from([a])))
.collect(),
}
}
}
#[async_trait]
impl Network for TestNetwork {
// Replaces the recorded peer set with the peers found in `multiaddresses`.
async fn set_reserved_peers(
&mut self,
_protocol: ProtocolName,
multiaddresses: HashSet<Multiaddr>,
) -> Result<(), String> {
self.peers_set = extract_peer_ids(multiaddresses.into_iter());
Ok(())
}
// Adds the peers found in `multiaddresses` to the recorded peer set.
async fn add_peers_to_reserved_set(
&mut self,
_protocol: ProtocolName,
multiaddresses: HashSet<Multiaddr>,
) -> Result<(), String> {
self.peers_set.extend(extract_peer_ids(multiaddresses.into_iter()));
Ok(())
}
// Removes the given peers from the recorded peer set.
async fn remove_from_peers_set(
&mut self,
_protocol: ProtocolName,
peers: Vec<PeerId>,
) -> Result<(), String> {
self.peers_set.retain(|elem| !peers.contains(elem));
Ok(())
}
// Requests are accepted and silently ignored by the mock.
async fn start_request<AD: AuthorityDiscovery>(
&self,
_: &mut AD,
_: Requests,
_: &ReqProtocolNames,
_: IfDisconnected,
) {
}
// The remaining methods are not expected to be called by these tests and panic if they are.
fn report_peer(&self, _: PeerId, _: ReputationChange) {
panic!()
}
fn disconnect_peer(&self, _: PeerId, _: ProtocolName) {
panic!()
}
fn peer_role(
&self,
_peer_id: PeerId,
_handshake: Vec<u8>,
) -> Option<sc_network::ObservedRole> {
panic!()
}
}
#[async_trait]
impl AuthorityDiscovery for TestAuthorityDiscovery {
// Plain table lookup; unknown authorities resolve to `None`.
async fn get_addresses_by_authority_id(
&mut self,
authority: AuthorityDiscoveryId,
) -> Option<HashSet<Multiaddr>> {
self.by_authority_id.get(&authority).cloned()
}
// Plain table lookup; unknown peers resolve to `None`.
async fn get_authority_ids_by_peer_id(
&mut self,
peer_id: PeerId,
) -> Option<HashSet<AuthorityDiscoveryId>> {
self.by_peer_id.get(&peer_id).cloned()
}
}
/// The authority ids of Alice, Bob and Charlie.
fn known_authorities() -> Vec<AuthorityDiscoveryId> {
[Sr25519Keyring::Alice, Sr25519Keyring::Bob, Sr25519Keyring::Charlie]
.iter()
.map(|k| k.public().into())
.collect()
}
/// Three freshly generated random peer ids.
fn known_peer_ids() -> Vec<PeerId> {
(0..3).map(|_| PeerId::random()).collect()
}
/// Three localhost TCP multiaddresses on consecutive ports.
fn known_multiaddr() -> Vec<Multiaddr> {
vec![
"/ip4/127.0.0.1/tcp/1234".parse().unwrap(),
"/ip4/127.0.0.1/tcp/1235".parse().unwrap(),
"/ip4/127.0.0.1/tcp/1236".parse().unwrap(),
]
}
// Test cleanup works: after a second request for a different authority, only the peer of
// that second authority is left in `previously_requested`.
#[test]
fn old_multiaddrs_are_removed_on_new_request() {
let mut service = new_service();
let (ns, ads) = new_network();
// Collected from a `HashMap`, so the order of `authority_ids` is arbitrary; the test only
// relies on index 0 and index 1 being different authorities.
let authority_ids: Vec<_> =
ads.by_peer_id.values().flat_map(|v| v.iter()).cloned().collect();
futures::executor::block_on(async move {
// The receiving ends of the `failed` channels are dropped right away in this test.
let (failed, _) = oneshot::channel();
let (ns, ads) = service
.on_request(vec![authority_ids[0].clone()], PeerSet::Validation, failed, ns, ads)
.await;
let (failed, _) = oneshot::channel();
let (_, ads) = service
.on_request(vec![authority_ids[1].clone()], PeerSet::Validation, failed, ns, ads)
.await;
let state = &service.state[PeerSet::Validation];
assert_eq!(state.previously_requested.len(), 1);
// Recover the peer id of the second authority from its (single) address.
let peer_1 = extract_peer_ids(
ads.by_authority_id.get(&authority_ids[1]).unwrap().clone().into_iter(),
)
.iter()
.cloned()
.next()
.unwrap();
assert!(state.previously_requested.contains(&peer_1));
});
}
// An authority that the discovery mock does not know is counted as failed and reported
// through the `failed` channel, while the resolvable authority is still requested.
#[test]
fn failed_resolution_is_reported_properly() {
let mut service = new_service();
let (ns, ads) = new_network();
let authority_ids: Vec<_> =
ads.by_peer_id.values().flat_map(|v| v.iter()).cloned().collect();
futures::executor::block_on(async move {
let (failed, failed_rx) = oneshot::channel();
// Ferdie is not part of `known_authorities`, so resolving it fails.
let unknown = Sr25519Keyring::Ferdie.public().into();
let (_, ads) = service
.on_request(
vec![authority_ids[0].clone(), unknown],
PeerSet::Validation,
failed,
ns,
ads,
)
.await;
let state = &service.state[PeerSet::Validation];
assert_eq!(state.previously_requested.len(), 1);
let peer_0 = extract_peer_ids(
ads.by_authority_id.get(&authority_ids[0]).unwrap().clone().into_iter(),
)
.iter()
.cloned()
.next()
.unwrap();
assert!(state.previously_requested.contains(&peer_0));
// Exactly one authority (the unknown one) failed to resolve.
let failed = failed_rx.await.unwrap();
assert_eq!(failed, 1);
});
}
}
@@ -0,0 +1,71 @@
[package]
name = "pezkuwi-collator-protocol"
version = "7.0.0"
description = "Pezkuwi Collator Protocol subsystem. Allows collators and validators to talk to each other."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
async-trait = { workspace = true, optional = true }
bitvec = { features = ["alloc"], workspace = true }
futures = { workspace = true }
futures-timer = { workspace = true }
gum = { workspace = true, default-features = true }
schnellru = { workspace = true }
sp-core = { workspace = true, default-features = true }
sp-keystore = { workspace = true, default-features = true }
sp-runtime = { workspace = true, default-features = true }
fatality = { workspace = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
thiserror = { workspace = true }
tokio-util = { workspace = true }
# This should have really been a dev-dependency but clippy is complaining that it's not used with
# experimental-collator-protocol disabled, while the rust compiler claims that having optional
# dev-dependencies is not possible.
tokio = { features = [
"macros",
], workspace = true, default-features = true, optional = true }
[dev-dependencies]
assert_matches = { workspace = true }
rstest = { workspace = true }
sp-tracing = { workspace = true }
codec = { features = ["std"], workspace = true, default-features = true }
sc-keystore = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
sp-core = { features = ["std"], workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
itertools = { workspace = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-primitives-test-helpers = { workspace = true }
[features]
default = []
experimental-collator-protocol = ["async-trait", "tokio"]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
"sp-runtime/runtime-benchmarks",
]
@@ -0,0 +1,165 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Primitives for tracking collations-related data.
use std::collections::{HashSet, VecDeque};
use futures::{future::BoxFuture, stream::FuturesUnordered};
use pezkuwi_node_network_protocol::{
request_response::{incoming::OutgoingResponse, v2 as protocol_v2, IncomingRequest},
PeerId,
};
use pezkuwi_node_primitives::PoV;
use pezkuwi_primitives::{
CandidateHash, CandidateReceiptV2 as CandidateReceipt, Hash, HeadData, Id as ParaId,
};
/// Distribution status of a collation, from the collator's point of view.
#[derive(Clone, Debug, PartialEq)]
pub enum CollationStatus {
    /// The collation exists but has not been advertised to any validator yet.
    Created,
    /// At least one validator received an advertisement for the collation.
    Advertised,
    /// At least one validator requested the collation.
    Requested,
}

impl CollationStatus {
    /// Moves the status to [`Self::Advertised`].
    ///
    /// A status that already reached [`Self::Requested`] is left untouched, so the status
    /// never moves backwards.
    pub fn advance_to_advertised(&mut self) {
        if *self != Self::Requested {
            *self = Self::Advertised;
        }
    }

    /// Moves the status to [`Self::Requested`], whatever it was before.
    pub fn advance_to_requested(&mut self) {
        *self = Self::Requested;
    }

    /// Returns the label used for this status in metrics.
    pub fn label(&self) -> &'static str {
        match self {
            Self::Created => "created",
            Self::Advertised => "advertised",
            Self::Requested => "requested",
        }
    }
}
/// A collation built by the collator, bundled with its current distribution status.
pub struct Collation {
/// Candidate receipt.
pub receipt: CandidateReceipt,
/// Proof to verify the state transition of the teyrchain.
pub pov: PoV,
/// Parent head data.
pub parent_head_data: HeadData,
/// Collation status, see [`CollationStatus`].
pub status: CollationStatus,
}
/// Stores the state for waiting collation fetches per relay parent.
///
/// The `Default` value has no active fetch, an empty request queue and no waiting peers.
#[derive(Default)]
pub struct WaitingCollationFetches {
/// A flag indicating that we have an ongoing request.
/// This limits the number of collations being sent at any moment
/// of time to 1 for each relay parent.
///
/// If set to `true`, any new request will be queued.
pub collation_fetch_active: bool,
/// The collation fetches waiting to be fulfilled, in queue order.
pub req_queue: VecDeque<VersionedCollationRequest>,
/// All peers that are waiting or actively uploading, together with the candidate hash
/// they asked for.
///
/// We will not accept multiple requests from the same peer, otherwise our DoS protection of
/// moving on to the next peer after `MAX_UNSHARED_UPLOAD_TIME` would be pointless.
pub waiting_peers: HashSet<(PeerId, CandidateHash)>,
}
/// Version-tagged wrapper around incoming collation fetching requests.
///
/// Only requests of the `v2` protocol are represented at the moment.
pub enum VersionedCollationRequest {
    /// A request received over the `v2` collation fetching protocol.
    V2(IncomingRequest<protocol_v2::CollationFetchingRequest>),
}

impl From<IncomingRequest<protocol_v2::CollationFetchingRequest>> for VersionedCollationRequest {
    fn from(request: IncomingRequest<protocol_v2::CollationFetchingRequest>) -> Self {
        VersionedCollationRequest::V2(request)
    }
}

impl VersionedCollationRequest {
    /// The teyrchain id carried in the request payload.
    pub fn para_id(&self) -> ParaId {
        let Self::V2(request) = self;
        request.payload.para_id
    }

    /// The candidate hash carried in the request payload.
    pub fn candidate_hash(&self) -> CandidateHash {
        let Self::V2(request) = self;
        request.payload.candidate_hash
    }

    /// The relay parent carried in the request payload.
    pub fn relay_parent(&self) -> Hash {
        let Self::V2(request) = self;
        request.payload.relay_parent
    }

    /// The id of the peer that sent the request.
    pub fn peer_id(&self) -> PeerId {
        let Self::V2(request) = self;
        request.peer
    }

    /// Consumes the request and sends `response` back to the requester.
    ///
    /// The result of the wrapped request's `send_outgoing_response` is returned unchanged.
    pub fn send_outgoing_response(
        self,
        response: OutgoingResponse<protocol_v2::CollationFetchingResponse>,
    ) -> Result<(), ()> {
        let Self::V2(request) = self;
        request.send_outgoing_response(response)
    }
}
/// Result of the finished background send-collation task.
///
/// Note that if the timeout was hit the request doesn't get
/// aborted, it only indicates that we should start processing
/// the next one from the queue.
pub struct CollationSendResult {
/// Candidate's relay parent.
pub relay_parent: Hash,
/// Candidate hash.
pub candidate_hash: CandidateHash,
/// Id of the peer the send task was for.
pub peer_id: PeerId,
/// Whether the max unshared timeout was hit (see the note on the struct).
pub timed_out: bool,
}
/// An unordered set of boxed futures, each resolving to the [`CollationSendResult`] of one
/// send-collation task.
pub type ActiveCollationFetches = FuturesUnordered<BoxFuture<'static, CollationSendResult>>;
@@ -0,0 +1,66 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_network_protocol::request_response::incoming;
use pezkuwi_node_primitives::UncheckedSignedFullStatement;
use pezkuwi_node_subsystem::{errors::SubsystemError, RuntimeApiError};
use pezkuwi_node_subsystem_util::{backing_implicit_view, runtime};
use crate::LOG_TARGET;
/// General result: `std::result::Result` with this module's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
use fatality::Nested;
/// Errors of this subsystem.
///
/// The `fatality` attribute (with `splitable`) is what allows `log_error` to separate fatal
/// errors from the ones that are only logged.
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
// Always fatal.
#[fatal]
#[error("Receiving message from overseer failed")]
SubsystemReceive(#[from] SubsystemError),
// NOTE(review): `forward` presumably delegates the fatal/non-fatal decision to the wrapped
// error, as described in the `fatality` crate documentation; confirm.
#[fatal(forward)]
#[error("Retrieving next incoming request failed")]
IncomingRequest(#[from] incoming::Error),
#[fatal(forward)]
#[error("Error while accessing runtime information")]
Runtime(#[from] runtime::Error),
// The variants below carry no `fatal` marker.
#[error("Error while accessing Runtime API")]
RuntimeApi(#[from] RuntimeApiError),
// Displayed transparently as the wrapped fetch error; note that there is no `#[from]` here.
#[error(transparent)]
ImplicitViewFetchError(backing_implicit_view::FetchError),
// Carries the statement whose signature check failed.
#[error("CollationSeconded contained statement with invalid signature")]
InvalidStatementSignature(UncheckedSignedFullStatement),
}
/// Consumes a top-level result, logging non-fatal errors and passing fatal ones on.
///
/// We basically always want to try and continue on error: a non-fatal error is logged as a
/// warning together with `ctx` and then turned into `Ok(())`. Only fatal errors are returned
/// to the caller.
pub fn log_error(result: Result<()>, ctx: &'static str) -> std::result::Result<(), FatalError> {
    // `?` returns early with the fatal part, leaving only the non-fatal part to inspect.
    if let Err(jfyi) = result.into_nested()? {
        gum::warn!(target: LOG_TARGET, error = ?jfyi, ctx);
    }
    Ok(())
}
@@ -0,0 +1,608 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
collections::HashMap,
time::{Duration, Instant},
};
use pezkuwi_node_subsystem::prometheus::prometheus::HistogramTimer;
use pezkuwi_node_subsystem_util::metrics::{self, prometheus};
use pezkuwi_primitives::{BlockNumber, CandidateReceiptV2 as CandidateReceipt, Hash};
use sp_core::H256;
use super::collation::CollationStatus;
/// Handle to the collator-side metrics.
///
/// Wraps an optional set of registered metrics. With the `Default` value (no inner metrics)
/// every recording method does nothing and every timer method returns `None`.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);

impl Metrics {
    /// Observe `latency` in the collation backing latency histogram.
    pub fn on_collation_backed(&self, latency: f64) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.collation_backing_latency.observe(latency);
    }

    /// Observe `latency` in the collation inclusion latency histogram.
    pub fn on_collation_included(&self, latency: f64) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.collation_inclusion_latency.observe(latency);
    }

    /// Count one advertisement made.
    pub fn on_advertisement_made(&self) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.advertisements_made.inc();
    }

    /// Count one collation that was requested to be sent.
    pub fn on_collation_sent_requested(&self) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.collations_send_requested.inc();
    }

    /// Count one collation sent.
    pub fn on_collation_sent(&self) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.collations_sent.inc();
    }

    /// Start a timer for `process_msg`; the timer observes on drop.
    pub fn time_process_msg(&self) -> Option<prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.process_msg.start_timer())
    }

    /// Start a timer for `distribute_collation` under the given `label`; the timer observes
    /// on drop.
    pub fn time_collation_distribution(
        &self,
        label: &'static str,
    ) -> Option<prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        let histogram = inner.collation_distribution_time.with_label_values(&[label]);
        Some(histogram.start_timer())
    }

    /// Start a timer measuring how long a collation waits before being fetched.
    pub fn time_collation_fetch_latency(&self) -> Option<prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.collation_fetch_latency.start_timer())
    }

    /// Start a timer measuring how long it takes for a fetched collation to be backed.
    pub fn time_collation_backing_latency(&self) -> Option<prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.collation_backing_latency_time.start_timer())
    }

    /// Observe `latency` for a collation that expired in the given `state`.
    ///
    /// Collations can expire in the following states: "advertised, fetched or backed"
    pub fn on_collation_expired(&self, latency: f64, state: &'static str) {
        let Some(inner) = self.0.as_ref() else { return };
        inner.collation_expired_total.with_label_values(&[state]).observe(latency);
    }
}
/// The registered Prometheus metrics behind [`Metrics`].
#[derive(Clone)]
struct MetricsInner {
/// Number of collation advertisements sent to validators.
advertisements_made: prometheus::Counter<prometheus::U64>,
/// Number of collations sent to validators.
collations_sent: prometheus::Counter<prometheus::U64>,
/// Number of collations requested to be sent to validators.
collations_send_requested: prometheus::Counter<prometheus::U64>,
/// Time spent within `process_msg`.
process_msg: prometheus::Histogram,
/// Time spent within `distribute_collation`, labelled by "state".
collation_distribution_time: prometheus::HistogramVec,
/// How much time collations spend waiting to be fetched.
collation_fetch_latency: prometheus::Histogram,
/// How much time it takes for a fetched collation to be backed.
collation_backing_latency_time: prometheus::Histogram,
/// How many blocks away from the relay parent collations are backed.
collation_backing_latency: prometheus::Histogram,
/// How many blocks it takes for a backed collation to be included.
collation_inclusion_latency: prometheus::Histogram,
/// Expired collations (not backed or not included), labelled by "state".
collation_expired_total: prometheus::HistogramVec,
}
impl metrics::Metrics for Metrics {
    /// Creates all collator-side metrics and registers them with `registry`.
    ///
    /// The metrics are created and registered one after another; the first error encountered
    /// is returned and nothing after it is registered.
    fn try_register(
        registry: &prometheus::Registry,
    ) -> std::result::Result<Self, prometheus::PrometheusError> {
        // Counters.
        let advertisements_made: prometheus::Counter<prometheus::U64> = prometheus::register(
            prometheus::Counter::new(
                "pezkuwi_teyrchain_collation_advertisements_made_total",
                "A number of collation advertisements sent to validators.",
            )?,
            registry,
        )?;
        let collations_send_requested: prometheus::Counter<prometheus::U64> =
            prometheus::register(
                prometheus::Counter::new(
                    "pezkuwi_teyrchain_collations_sent_requested_total",
                    "A number of collations requested to be sent to validators.",
                )?,
                registry,
            )?;
        let collations_sent: prometheus::Counter<prometheus::U64> = prometheus::register(
            prometheus::Counter::new(
                "pezkuwi_teyrchain_collations_sent_total",
                "A number of collations sent to validators.",
            )?,
            registry,
        )?;

        // Histograms measuring wall-clock time (seconds).
        let process_msg_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collator_protocol_collator_process_msg",
            "Time spent within `collator_protocol_collator::process_msg`",
        )
        .buckets(vec![
            0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 1.0,
        ]);
        let process_msg =
            prometheus::register(prometheus::Histogram::with_opts(process_msg_opts)?, registry)?;

        let distribution_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collator_protocol_collator_distribution_time",
            "Time spent within `collator_protocol_collator::distribute_collation`",
        )
        .buckets(vec![
            0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 1.0,
        ]);
        let collation_distribution_time = prometheus::register(
            prometheus::HistogramVec::new(distribution_opts, &["state"])?,
            registry,
        )?;

        let fetch_latency_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collation_fetch_latency",
            "How much time collations spend waiting to be fetched",
        )
        .buckets(vec![
            0.001, 0.01, 0.025, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 1.0, 2.0, 5.0,
        ]);
        let collation_fetch_latency = prometheus::register(
            prometheus::Histogram::with_opts(fetch_latency_opts)?,
            registry,
        )?;

        let backing_time_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collation_backing_latency_time",
            "How much time it takes for a fetched collation to be backed",
        )
        .buckets(vec![
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0, 12.0, 15.0, 18.0, 24.0, 30.0,
        ]);
        let collation_backing_latency_time = prometheus::register(
            prometheus::Histogram::with_opts(backing_time_opts)?,
            registry,
        )?;

        // Histograms measuring a number of blocks.
        let backing_blocks_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collation_backing_latency",
            "How many blocks away from the relay parent are collations backed",
        )
        .buckets(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
        let collation_backing_latency = prometheus::register(
            prometheus::Histogram::with_opts(backing_blocks_opts)?,
            registry,
        )?;

        let inclusion_blocks_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collation_inclusion_latency",
            "How many blocks it takes for a backed collation to be included",
        )
        .buckets(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
        let collation_inclusion_latency = prometheus::register(
            prometheus::Histogram::with_opts(inclusion_blocks_opts)?,
            registry,
        )?;

        let expired_opts = prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collation_expired",
            "How many collations expired (not backed or not included)",
        )
        .buckets(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
        let collation_expired_total = prometheus::register(
            prometheus::HistogramVec::new(expired_opts, &["state"])?,
            registry,
        )?;

        Ok(Metrics(Some(MetricsInner {
            advertisements_made,
            collations_sent,
            collations_send_requested,
            process_msg,
            collation_distribution_time,
            collation_fetch_latency,
            collation_backing_latency_time,
            collation_backing_latency,
            collation_inclusion_latency,
            collation_expired_total,
        })))
    }
}
/// Tracking time-to-live, in blocks after the relay parent, of a collation that was fetched
/// but not backed yet. Equal to claim queue length.
pub(crate) const MAX_BACKING_DELAY: BlockNumber = 3;
/// Number of blocks added to the tracking time-to-live of a collation that was backed but not
/// included yet. Paras availability period. In practice, candidates time out in exceptional
/// situations.
pub(crate) const MAX_AVAILABILITY_DELAY: BlockNumber = 10;
/// Collations are kept in the tracker, until they are included or expired
#[derive(Default)]
pub(crate) struct CollationTracker {
/// All un-expired collation entries
entries: HashMap<Hash, CollationStats>,
}
impl CollationTracker {
/// Mark a tracked collation as backed.
///
/// Block built on top of N is earliest backed at N + 1.
pub fn collation_backed(
&mut self,
block_number: BlockNumber,
leaf: H256,
receipt: CandidateReceipt,
) {
let head = receipt.descriptor.para_head();
let Some(entry) = self.entries.get_mut(&head) else {
gum::debug!(
target: crate::LOG_TARGET_STATS,
?head,
"Backed collation not found in tracker",
);
return;
};
if entry.backed().is_some() {
gum::debug!(
target: crate::LOG_TARGET_STATS,
?head,
"Collation already backed in a fork, skipping",
);
return;
}
entry.set_backed_at(block_number);
if let Some(latency) = entry.backed() {
// Observe the backing latency since the collation was fetched.
let maybe_latency =
entry.backed_latency_metric.take().map(|metric| metric.stop_and_record());
gum::debug!(
target: crate::LOG_TARGET_STATS,
latency_blocks = ?latency,
latency_time = ?maybe_latency,
relay_block = ?leaf,
relay_parent = ?entry.relay_parent,
para_id = ?receipt.descriptor.para_id(),
?head,
"A fetched collation was backed on relay chain",
);
}
}
/// Mark a previously backed collation as included.
///
/// Block built on top of N is earliest included at N + 2.
pub fn collation_included(
&mut self,
block_number: BlockNumber,
leaf: H256,
receipt: CandidateReceipt,
) {
let head = receipt.descriptor.para_head();
let para_id = receipt.descriptor.para_id();
let Some(entry) = self.entries.get_mut(&head) else {
gum::debug!(
target: crate::LOG_TARGET_STATS,
?para_id,
?head,
"Included collation not found in tracker",
);
return;
};
let pov_hash = entry.pov_hash();
let candidate_hash = entry.candidate_hash();
if entry.included().is_some() {
gum::debug!(
target: crate::LOG_TARGET_STATS,
?para_id,
?head,
?candidate_hash,
?pov_hash,
"Collation already included in a fork, skipping",
);
return;
}
entry.set_included_at(block_number);
if let Some(latency) = entry.included() {
gum::debug!(
target: crate::LOG_TARGET_STATS,
?latency,
relay_block = ?leaf,
relay_parent = ?entry.relay_parent,
?para_id,
?head,
?candidate_hash,
?pov_hash,
"Collation included on relay chain",
);
}
}
/// Returns all the collations that have expired at `block_number`.
pub fn drain_expired(&mut self, block_number: BlockNumber) -> Vec<CollationStats> {
let expired = self
.entries
.iter()
.filter_map(|(head, entry)| entry.is_tracking_expired(block_number).then_some(*head))
.collect::<Vec<_>>();
expired
.iter()
.filter_map(|head| self.entries.remove(head))
.map(|mut entry| {
entry.set_expired_at(block_number);
entry
})
.collect::<Vec<_>>()
}
/// Drain and return all collations that are possibly finalized at `block_number`.
///
/// We only track the inclusion block number, not the inclusion block hash.
/// There is a small chance that a collation was included in a fork that is not finalized.
pub fn drain_finalized(&mut self, block_number: BlockNumber) -> Vec<CollationStats> {
let finalized = self
.entries
.iter()
.filter_map(|(head, entry)| entry.is_possibly_finalized(block_number).then_some(*head))
.collect::<Vec<_>>();
finalized
.iter()
.filter_map(|head| self.entries.remove(head))
.collect::<Vec<_>>()
}
/// Track a collation for a given period of time (TTL). TTL depends
/// on the collation state.
/// Collation is evicted after it expires.
pub fn track(&mut self, mut stats: CollationStats) {
// Disable the fetch timer, to prevent bogus observe on drop.
if let Some(fetch_latency_metric) = stats.fetch_latency_metric.take() {
fetch_latency_metric.stop_and_discard();
}
if let Some(entry) = self
.entries
.values()
.find(|entry| entry.relay_parent_number == stats.relay_parent_number)
{
gum::debug!(
target: crate::LOG_TARGET_STATS,
?stats.relay_parent_number,
?stats.relay_parent,
entry_relay_parent = ?entry.relay_parent,
"Collation built on a fork",
);
}
self.entries.insert(stats.head, stats);
}
}
/// Information about how collations live their lives: when they were advertised, fetched,
/// backed, included or expired.
pub(crate) struct CollationStats {
/// The pre-backing collation status information. Starts as `Created`.
pre_backing_status: CollationStatus,
/// The block header hash. Used as the key in `CollationTracker`.
head: Hash,
/// The block number of the relay parent on top of which the collation was built.
relay_parent_number: BlockNumber,
/// The relay parent hash.
relay_parent: Hash,
/// The block number at which the collation expired, if it expired.
expired_at: Option<BlockNumber>,
/// The block number at which the collation was backed, if it was backed.
backed_at: Option<BlockNumber>,
/// The block number at which the collation was included, if it was included.
included_at: Option<BlockNumber>,
/// The time the collation was fetched, if it was fetched.
fetched_at: Option<Instant>,
/// Advertisement time. Set to the moment the stats are created.
advertised_at: Instant,
/// Timer for the collation fetch latency (seconds).
fetch_latency_metric: Option<HistogramTimer>,
/// Timer for the collation backing latency (seconds). Duration since collation fetched
/// until the import of a relay chain block where collation is backed.
backed_latency_metric: Option<HistogramTimer>,
/// The Collation candidate hash
candidate_hash: Hash,
/// The Collation PoV hash
pov_hash: Hash,
}
impl CollationStats {
    /// Creates stats for a collation that was not fetched, backed, included or expired yet.
    ///
    /// The advertisement time is set to "now" and a fetch latency timer is requested from
    /// `metrics`.
    pub fn new(
        head: Hash,
        relay_parent_number: BlockNumber,
        relay_parent: Hash,
        metrics: &Metrics,
        candidate_hash: Hash,
        pov_hash: Hash,
    ) -> Self {
        let advertised_at = Instant::now();
        let fetch_latency_metric = metrics.time_collation_fetch_latency();

        Self {
            pre_backing_status: CollationStatus::Created,
            head,
            relay_parent_number,
            relay_parent,
            expired_at: None,
            backed_at: None,
            included_at: None,
            fetched_at: None,
            advertised_at,
            fetch_latency_metric,
            backed_latency_metric: None,
            candidate_hash,
            pov_hash,
        }
    }

    /// The hash and the block number of the relay parent.
    pub fn relay_parent(&self) -> (Hash, BlockNumber) {
        let Self { relay_parent, relay_parent_number, .. } = self;
        (*relay_parent, *relay_parent_number)
    }

    /// Number of blocks between the relay parent and the expiration block, if expired.
    pub fn expired(&self) -> Option<BlockNumber> {
        self.expired_at
            .map(|expired_at| expired_at.saturating_sub(self.relay_parent_number))
    }

    /// Number of blocks between the relay parent and the backing block, if backed.
    pub fn backed(&self) -> Option<BlockNumber> {
        self.backed_at
            .map(|backed_at| backed_at.saturating_sub(self.relay_parent_number))
    }

    /// Number of blocks between the backing block and the inclusion block.
    ///
    /// `None` unless both the backing and the inclusion block are known.
    pub fn included(&self) -> Option<BlockNumber> {
        self.included_at
            .zip(self.backed_at)
            .map(|(included_at, backed_at)| included_at.saturating_sub(backed_at))
    }

    /// Time between the advertisement and the fetch of the collation, if it was fetched.
    pub fn fetch_latency(&self) -> Option<Duration> {
        self.fetched_at.map(|fetched_at| fetched_at - self.advertised_at)
    }

    /// The teyrchain block header hash.
    pub fn head(&self) -> H256 {
        self.head
    }

    /// The candidate hash.
    pub fn candidate_hash(&self) -> H256 {
        self.candidate_hash
    }

    /// The candidate PoV hash.
    pub fn pov_hash(&self) -> H256 {
        self.pov_hash
    }

    /// Records the time at which the collation was fetched.
    pub fn set_fetched_at(&mut self, fetched_at: Instant) {
        self.fetched_at = Some(fetched_at);
    }

    /// Records the block number at which the collation was backed.
    pub fn set_backed_at(&mut self, backed_at: BlockNumber) {
        self.backed_at = Some(backed_at);
    }

    /// Records the block number at which the collation was included.
    pub fn set_included_at(&mut self, included_at: BlockNumber) {
        self.included_at = Some(included_at);
    }

    /// Records the block number at which the collation expired.
    pub fn set_expired_at(&mut self, expired_at: BlockNumber) {
        self.expired_at = Some(expired_at);
    }

    /// Replaces the pre-backing status of the collation.
    pub fn set_pre_backing_status(&mut self, status: CollationStatus) {
        self.pre_backing_status = status;
    }

    /// The pre-backing status of the collation.
    pub fn pre_backing_status(&self) -> &CollationStatus {
        &self.pre_backing_status
    }

    /// Takes the fetch latency timer out of the stats, leaving `None` behind.
    pub fn take_fetch_latency_metric(&mut self) -> Option<HistogramTimer> {
        self.fetch_latency_metric.take()
    }

    /// Replaces the backing latency timer.
    pub fn set_backed_latency_metric(&mut self, timer: Option<HistogramTimer>) {
        self.backed_latency_metric = timer;
    }

    /// The time to live of the collation, in blocks after its relay parent.
    pub fn tracking_ttl(&self) -> BlockNumber {
        match (self.fetch_latency(), self.backed(), self.included()) {
            // Collation was never fetched, expires ASAP.
            (None, _, _) => 0,
            // Fetched, but not backed yet.
            (Some(_), None, _) => MAX_BACKING_DELAY,
            // Backed, but not included yet.
            (Some(_), Some(backed), None) => backed + MAX_AVAILABILITY_DELAY,
            // If block included no reason to track it.
            (Some(_), Some(_), Some(_)) => 0,
        }
    }

    /// The state the collation is in at the moment of expiry.
    pub fn expiry_state(&self) -> &'static str {
        match (self.fetch_latency(), self.backed(), self.included()) {
            // Not fetched: rely on the status provided by the collator protocol.
            (None, _, _) => self.pre_backing_status().label(),
            (Some(_), None, _) => "fetched",
            (Some(_), Some(_), None) => "backed",
            (Some(_), Some(_), Some(_)) => "none",
        }
    }

    /// Whether the tracking of the collation is expired at `current_block`.
    ///
    /// Collations reporting an inclusion never expire here.
    pub fn is_tracking_expired(&self, current_block: BlockNumber) -> bool {
        match self.included() {
            Some(_) => false,
            None => self.relay_parent_number + self.tracking_ttl() <= current_block,
        }
    }

    /// Whether the collation is possibly finalized, judging by block number only.
    ///
    /// Returns `true` if the collation was included at or before `last_finalized`.
    ///
    /// We only track the inclusion block number, not the inclusion block hash.
    /// There is a small chance that a collation was included in a fork that is not finalized.
    pub fn is_possibly_finalized(&self, last_finalized: BlockNumber) -> bool {
        matches!(self.included_at, Some(included_at) if included_at <= last_finalized)
    }
}
impl Drop for CollationStats {
    fn drop(&mut self) {
        // A timer that is still attached here was not consumed through its regular path: the
        // fetch timer is handed out via `take_fetch_latency_metric` and the backing timer is
        // stopped and recorded in `collation_backed`. Discard whatever is left explicitly, so
        // that dropping the stats does not observe a late, misleadingly high value.
        let leftover_timers = self
            .fetch_latency_metric
            .take()
            .into_iter()
            .chain(self.backed_latency_metric.take());

        for timer in leftover_timers {
            timer.stop_and_discard();
        }
    }
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,883 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Tests for the collator side with enabled prospective teyrchains.
use super::*;
use pezkuwi_node_subsystem::messages::ChainApiMessage;
use pezkuwi_primitives::Header;
use rstest::rstest;
/// Derives the hash these tests use as the parent of `hash`: the value returned by
/// `to_low_u64_be`, incremented by one and turned back into a hash.
fn get_parent_hash(hash: Hash) -> Hash {
    let parent_low_bits = hash.to_low_u64_be() + 1;
    Hash::from_low_u64_be(parent_low_bits)
}
/// Handle a view update.
///
/// Sends an `OurViewChange` built from `new_view` to the subsystem under test and then
/// answers the requests the subsystem is expected to issue in response.
///
/// * `expected_connected` - when `Some`, the helper finishes by calling
///   `check_connected_to_validators` with these authority ids.
/// * `new_view` - the heads of the new view together with their block numbers.
/// * `activated` - how many times the per-leaf exchange below is driven.
///
/// Note that the helper returns early, skipping all remaining expectations, when no message
/// arrives within the timeout while ancestor headers are being served.
pub(super) async fn update_view(
expected_connected: Option<Vec<AuthorityDiscoveryId>>,
test_state: &TestState,
virtual_overseer: &mut VirtualOverseer,
new_view: Vec<(Hash, u32)>, // Hash and block number.
activated: u8, // How many new heads does this update contain?
) {
// Index the view by hash so that the block number of a requested leaf can be looked up.
let new_view: HashMap<Hash, u32> = HashMap::from_iter(new_view);
let our_view = OurView::new(new_view.keys().map(|hash| *hash), 0);
overseer_send(
virtual_overseer,
CollatorProtocolMessage::NetworkBridgeUpdate(NetworkBridgeEvent::OurViewChange(our_view)),
)
.await;
// One round of the exchange below is expected per activated head.
for _ in 0..activated {
assert_matches!(
overseer_recv(virtual_overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionIndexForChild(tx),
)) => {
tx.send(Ok(test_state.current_session_index())).unwrap();
}
);
// obtain the claim queue schedule.
// The relay parent of this request identifies the leaf being processed; it must be part
// of `new_view`.
let (leaf_hash, leaf_number) = assert_matches!(
overseer_recv(virtual_overseer).await,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
parent,
RuntimeApiRequest::ClaimQueue(tx),
)) => {
tx.send(Ok(test_state.claim_queue.clone())).unwrap();
(parent, new_view.get(&parent).copied().expect("Unknown parent requested"))
}
);
// The ancestry served for this leaf: the leaf itself followed by its ancestors down to
// block `min_number`. Hashes are derived with `get_parent_hash` and paired with
// descending block numbers.
let min_number = leaf_number.saturating_sub(SCHEDULING_LOOKAHEAD as u32 - 1);
let ancestry_len = leaf_number + 1 - min_number;
let ancestry_hashes = std::iter::successors(Some(leaf_hash), |h| Some(get_parent_hash(*h)))
.take(ancestry_len as usize);
let ancestry_numbers = (min_number..=leaf_number).rev();
let mut ancestry_iter = ancestry_hashes.clone().zip(ancestry_numbers).peekable();
// The first entry is the leaf itself: serve its header, the session index, the
// scheduling lookahead and finally the hashes of its ancestors.
if let Some((hash, number)) = ancestry_iter.next() {
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::ChainApi(ChainApiMessage::BlockHeader(.., tx)) => {
let header = Header {
parent_hash: get_parent_hash(hash),
number,
state_root: Hash::zero(),
extrinsics_root: Hash::zero(),
digest: Default::default(),
};
tx.send(Ok(Some(header))).unwrap();
}
);
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
..,
RuntimeApiRequest::SessionIndexForChild(
tx
)
)
) => {
tx.send(Ok(1)).unwrap();
}
);
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
..,
RuntimeApiRequest::SchedulingLookahead(
session_index,
tx
)
)
) => {
assert_eq!(session_index, 1);
tx.send(Ok(SCHEDULING_LOOKAHEAD as u32)).unwrap();
}
);
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::ChainApi(
ChainApiMessage::Ancestors {
k,
response_channel: tx,
..
}
) => {
assert_eq!(k, SCHEDULING_LOOKAHEAD - 1);
// Every ancestry hash except the leaf itself.
let hashes: Vec<_> = ancestry_hashes.clone().skip(1).into_iter().collect();
assert_eq!(k, hashes.len());
tx.send(Ok(hashes)).unwrap();
}
);
}
// One session index request is answered for each remaining ancestor.
for _ in ancestry_iter.clone() {
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
..,
RuntimeApiRequest::SessionIndexForChild(
tx
)
)
) => {
tx.send(Ok(1)).unwrap();
}
);
}
// Serve the headers of the remaining ancestors for as long as the subsystem asks for them.
let mut iter_clone = ancestry_iter.clone();
while let Some((hash, number)) = iter_clone.next() {
// May be `None` for the last element.
let parent_hash =
iter_clone.peek().map(|(h, _)| *h).unwrap_or_else(|| get_parent_hash(hash));
// No further message within the timeout: the whole helper returns here and the
// remaining expectations, including the final connection check, are skipped.
let Some(msg) =
overseer_peek_with_timeout(virtual_overseer, Duration::from_millis(50)).await
else {
return;
};
if !matches!(
&msg,
AllMessages::ChainApi(ChainApiMessage::BlockHeader(_hash, ..))
if *_hash == hash
) {
// Ancestry has already been cached for this leaf.
break;
}
assert_matches!(
overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(),
AllMessages::ChainApi(ChainApiMessage::BlockHeader(.., tx)) => {
let header = Header {
parent_hash,
number,
state_root: Hash::zero(),
extrinsics_root: Hash::zero(),
digest: Default::default(),
};
tx.send(Ok(Some(header))).unwrap();
}
);
}
// One validator group lookup is expected for every core whose claim queue contains
// `test_state.para_id`.
for (_core, _paras) in test_state
.claim_queue
.iter()
.filter(|(_, paras)| paras.contains(&test_state.para_id))
{
expect_determine_validator_group(virtual_overseer, &test_state).await;
}
// Handle follow-up requests for the remaining ancestors. `ClaimQueue` and
// `CandidateEvents` requests are answered directly, a `SessionIndexForChild` request
// starts another round of validator group lookups, and any other message (or a timeout)
// ends the inner loop.
for _ in ancestry_iter {
while let Some(msg) =
overseer_peek_with_timeout(virtual_overseer, Duration::from_millis(50)).await
{
if !matches!(
&msg,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::ClaimQueue(_),
))
) && !matches!(
&msg,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::CandidateEvents(_),
))
) && !matches!(
&msg,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionIndexForChild(_),
))
) {
break;
}
if matches!(
&msg,
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionIndexForChild(_),
))
) {
for (_core, _paras) in test_state
.claim_queue
.iter()
.filter(|(_, paras)| paras.contains(&test_state.para_id))
{
expect_determine_validator_group(virtual_overseer, &test_state).await;
}
break;
}
// The peeked message is a `ClaimQueue` or `CandidateEvents` request: consume and answer it.
match overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50))
.await
.unwrap()
{
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::ClaimQueue(tx),
)) => {
tx.send(Ok(test_state.claim_queue.clone())).unwrap();
},
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
..,
RuntimeApiRequest::CandidateEvents(tx),
)) => {
tx.send(Ok(vec![])).unwrap();
},
_ => {
unimplemented!()
},
}
}
}
}
if let Some(expected_connected) = expected_connected {
check_connected_to_validators(virtual_overseer, expected_connected).await;
}
}
/// Check that the next received message is a `Declare` message.
pub(super) async fn expect_declare_msg(
virtual_overseer: &mut VirtualOverseer,
test_state: &TestState,
peer: &PeerId,
) {
assert_matches!(
overseer_recv(virtual_overseer).await,
AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendCollationMessage(
to,
CollationProtocols::V2(protocol_v2::CollationProtocol::CollatorProtocol(
wire_message,
)),
)) => {
assert_eq!(to[0], *peer);
assert_matches!(
wire_message,
protocol_v2::CollatorProtocolMessage::Declare(
collator_id,
para_id,
signature,
) => {
assert!(signature.verify(
&*protocol_v2::declare_signature_payload(&test_state.local_peer_id),
&collator_id),
);
assert_eq!(collator_id, test_state.collator_pair.public());
assert_eq!(para_id, test_state.para_id);
}
);
}
);
}
/// Test that a collator distributes a collation from the allowed ancestry
/// to correct validators group.
/// Run once with validators sending their view first and then the collator setting their own
/// view first.
///
/// The collation is built on `head_c`. It is expected to be advertised while `head_c` is
/// still part of the view, and a second collation on `head_c` is expected to produce no
/// message once the view has moved to `head_a`.
#[rstest]
#[case(true)]
#[case(false)]
fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: bool) {
let head_a = Hash::from_low_u64_be(126);
let head_a_num: u32 = 66;
// Grandparent of head `a`.
let head_b = Hash::from_low_u64_be(128);
let head_b_num: u32 = 64;
// Grandparent of head `b`.
let head_c = Hash::from_low_u64_be(130);
let head_c_num = 62;
let group_rotation_info = GroupRotationInfo {
session_start_block: head_c_num - 2,
group_rotation_frequency: 3,
now: head_c_num,
};
let mut test_state = TestState::default();
test_state.group_rotation_info = group_rotation_info;
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();
test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|mut test_harness| async move {
let virtual_overseer = &mut test_harness.virtual_overseer;
overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;
// Set collating para id.
overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
.await;
if validator_sends_view_first {
// Activate leaf `c` to accept at least the collation.
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(head_c, head_c_num)],
1,
)
.await;
} else {
// Activated leaf is `b`, but the collation will be based on `c`.
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(head_b, head_b_num)],
1,
)
.await;
}
// Connect every validator of the current group as a V2 peer.
let validator_peer_ids = test_state.current_group_validator_peer_ids();
for (val, peer) in test_state
.current_group_validator_authority_ids()
.into_iter()
.zip(validator_peer_ids.clone())
{
connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(val.clone())).await;
}
// Collator declared itself to each peer.
for peer_id in &validator_peer_ids {
expect_declare_msg(virtual_overseer, &test_state, peer_id).await;
}
// Build and distribute a candidate whose relay parent is `head_c`.
let pov = PoV { block_data: BlockData(vec![1, 2, 3]) };
let parent_head_data_hash = Hash::repeat_byte(0xAA);
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_c,
pov_hash: pov.hash(),
..Default::default()
}
.build();
let DistributeCollation { candidate, pov_block: _ } =
distribute_collation_with_receipt(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
candidate,
pov,
parent_head_data_hash,
)
.await;
let candidate_hash = candidate.hash();
// Update peer views.
// When the collator set its view first, each view change is expected to be followed
// directly by an advertisement to that peer.
for peer_id in &validator_peer_ids {
send_peer_view_change(virtual_overseer, peer_id, vec![head_b]).await;
if !validator_sends_view_first {
expect_advertise_collation_msg(
virtual_overseer,
&[*peer_id],
head_c,
vec![candidate_hash],
)
.await;
}
}
if validator_sends_view_first {
// Activated leaf is `b`, but the collation will be based on `c`.
update_view(None, &test_state, virtual_overseer, vec![(head_b, head_b_num)], 1)
.await;
// One advertisement is expected per validator peer.
for _ in &validator_peer_ids {
expect_advertise_collation_msg(
virtual_overseer,
&validator_peer_ids,
head_c,
vec![candidate_hash],
)
.await;
}
check_connected_to_validators(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
)
.await;
}
// Head `c` goes out of view.
// Build a different candidate for this relay parent and attempt to distribute it.
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(head_a, head_a_num)],
1,
)
.await;
let pov = PoV { block_data: BlockData(vec![4, 5, 6]) };
let parent_head_data_hash = Hash::repeat_byte(0xBB);
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_c,
pov_hash: pov.hash(),
..Default::default()
}
.build();
overseer_send(
virtual_overseer,
CollatorProtocolMessage::DistributeCollation {
candidate_receipt: candidate.clone(),
parent_head_data_hash,
pov: pov.clone(),
parent_head_data: HeadData(vec![1, 2, 3]),
result_sender: None,
core_index: CoreIndex(0),
},
)
.await;
check_connected_to_validators(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
)
.await;
// Parent out of view, nothing happens.
assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100))
.await
.is_none());
test_harness
},
);
}
/// Tests that collator respects the per relay parent limit of collations, which is equal to the
/// number of assignments they have in the claim queue for that core.
///
/// After the expected number of collations has been distributed on core 0, one more
/// collation on core 0 and one on core 1 are each expected to produce no further message.
#[test]
fn distribute_collation_up_to_limit() {
let mut test_state = TestState::default();
// Claim queue has 4 assignments for our paraid on core 0, 1 assignment for another paraid on
// core 1. Let's replace one of our assignments on core 0.
*test_state.claim_queue.get_mut(&CoreIndex(0)).unwrap().get_mut(1).unwrap() = ParaId::from(3);
// Number of collations that should still be accepted after the replacement above.
let expected_assignments = SCHEDULING_LOOKAHEAD - 1;
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();
test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|mut test_harness| async move {
let virtual_overseer = &mut test_harness.virtual_overseer;
let head_a = Hash::from_low_u64_be(128);
let head_a_num: u32 = 64;
// Grandparent of head `a`.
let head_b = Hash::from_low_u64_be(130);
overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await;
// Set collating para id.
overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id))
.await;
// Activated leaf is `a`, but the collation will be based on `b`.
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
virtual_overseer,
vec![(head_a, head_a_num)],
1,
)
.await;
// Distribute the expected number of collations on core 0; each one uses a distinct PoV.
for i in 0..expected_assignments {
let pov = PoV { block_data: BlockData(vec![i as u8]) };
let parent_head_data_hash = Hash::repeat_byte(0xAA);
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_b,
pov_hash: pov.hash(),
core_index: CoreIndex(0),
..Default::default()
}
.build();
distribute_collation_with_receipt(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
candidate,
pov,
parent_head_data_hash,
)
.await;
}
// One more collation on core 0, beyond the expected assignments.
let pov = PoV { block_data: BlockData(vec![10, 12, 6]) };
let parent_head_data_hash = Hash::repeat_byte(0xBB);
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_b,
pov_hash: pov.hash(),
core_index: CoreIndex(0),
..Default::default()
}
.build();
overseer_send(
virtual_overseer,
CollatorProtocolMessage::DistributeCollation {
candidate_receipt: candidate.clone(),
parent_head_data_hash,
pov: pov.clone(),
parent_head_data: HeadData(vec![1, 2, 3]),
result_sender: None,
core_index: CoreIndex(0),
},
)
.await;
check_connected_to_validators(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
)
.await;
// Limit has been reached.
assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100))
.await
.is_none());
// Let's also try on core 1, where we don't have any assignments.
let pov = PoV { block_data: BlockData(vec![10, 12, 6]) };
let parent_head_data_hash = Hash::repeat_byte(0xBB);
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_b,
pov_hash: pov.hash(),
core_index: CoreIndex(1),
..Default::default()
}
.build();
overseer_send(
virtual_overseer,
CollatorProtocolMessage::DistributeCollation {
candidate_receipt: candidate.clone(),
parent_head_data_hash,
pov: pov.clone(),
parent_head_data: HeadData(vec![1, 2, 3]),
result_sender: None,
core_index: CoreIndex(1),
},
)
.await;
check_connected_to_validators(
virtual_overseer,
test_state.current_group_validator_authority_ids(),
)
.await;
// Nothing is expected to be sent for core 1 either.
assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100))
.await
.is_none());
test_harness
},
)
}
/// Tests that the collator sends the parent head data in
/// case the para is assigned to multiple cores (elastic scaling).
///
/// A collation is distributed and advertised to one validator, which then requests it. The
/// response must be the `CollationWithParentHeadData` variant carrying the receipt, the PoV
/// and the parent head data that were distributed.
#[test]
fn send_parent_head_data_for_elastic_scaling() {
let test_state = TestState::with_elastic_scaling();
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();
test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|test_harness| async move {
let mut virtual_overseer = test_harness.virtual_overseer;
let mut req_v2_cfg = test_harness.req_v2_cfg;
let head_b = Hash::from_low_u64_be(129);
let head_b_num: u32 = 63;
overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups)
.await;
// Set collating para id.
overseer_send(
&mut virtual_overseer,
CollatorProtocolMessage::CollateOn(test_state.para_id),
)
.await;
// The validators expected to be connected are those of cores 0, 2 and 3, deduplicated
// through a `HashSet`.
let expected_connected = [CoreIndex(0), CoreIndex(2), CoreIndex(3)]
.into_iter()
.map(|core| test_state.validator_authority_ids_for_core(core))
.fold(HashSet::new(), |mut acc, res| {
acc.extend(res.into_iter());
acc
})
.into_iter()
.collect::<Vec<_>>();
update_view(
Some(expected_connected.clone()),
&test_state,
&mut virtual_overseer,
vec![(head_b, head_b_num)],
1,
)
.await;
let pov_data = PoV { block_data: BlockData(vec![1 as u8]) };
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_b,
pov_hash: pov_data.hash(),
..Default::default()
}
.build();
// Parent head data and its hash; the same head data is expected back in the response.
let phd = HeadData(vec![1, 2, 3]);
let phdh = phd.hash();
distribute_collation_with_receipt(
&mut virtual_overseer,
expected_connected,
candidate.clone(),
pov_data.clone(),
phdh,
)
.await;
// Connect a single validator and let it learn about the collation.
let peer = test_state.validator_peer_id[0];
let validator_id = test_state.current_group_validator_authority_ids()[0].clone();
connect_peer(
&mut virtual_overseer,
peer,
CollationVersion::V2,
Some(validator_id.clone()),
)
.await;
expect_declare_msg(&mut virtual_overseer, &test_state, &peer).await;
send_peer_view_change(&mut virtual_overseer, &peer, vec![head_b]).await;
let hashes: Vec<_> = vec![candidate.hash()];
expect_advertise_collation_msg(&mut virtual_overseer, &[peer], head_b, hashes).await;
// Request the collation through the V2 request queue and wait for the response.
let (pending_response, rx) = oneshot::channel();
req_v2_cfg
.inbound_queue
.as_mut()
.unwrap()
.send(RawIncomingRequest {
peer,
payload: CollationFetchingRequest {
relay_parent: head_b,
para_id: test_state.para_id,
candidate_hash: candidate.hash(),
}
.encode(),
pending_response,
})
.await
.unwrap();
assert_matches!(
rx.await,
Ok(full_response) => {
let response: CollationFetchingResponse =
CollationFetchingResponse::decode(
&mut full_response.result
.expect("We should have a proper answer").as_ref()
).expect("Decoding should work");
assert_matches!(
response,
CollationFetchingResponse::CollationWithParentHeadData {
receipt, pov, parent_head_data
} => {
assert_eq!(receipt, candidate);
assert_eq!(pov, pov_data);
assert_eq!(parent_head_data, phd);
}
);
}
);
// Hand both halves of the harness back to `test_harness`.
TestHarness { virtual_overseer, req_v2_cfg }
},
)
}
/// Tests that collator correctly handles peer V2 requests.
///
/// Two collations are distributed on `head_b`, which is no longer a leaf once `head_a` is
/// activated. Both are expected to be advertised to a validator whose view contains
/// `head_b`, and each of them must be served when requested by candidate hash.
#[test]
fn advertise_and_send_collation_by_hash() {
let test_state = TestState::default();
let local_peer_id = test_state.local_peer_id;
let collator_pair = test_state.collator_pair.clone();
test_harness(
local_peer_id,
collator_pair,
ReputationAggregator::new(|_| true),
|test_harness| async move {
let mut virtual_overseer = test_harness.virtual_overseer;
let mut req_v2_cfg = test_harness.req_v2_cfg;
let head_a = Hash::from_low_u64_be(128);
let head_a_num: u32 = 64;
// Parent of head `a`.
let head_b = Hash::from_low_u64_be(129);
let head_b_num: u32 = 63;
overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups)
.await;
// Set collating para id.
overseer_send(
&mut virtual_overseer,
CollatorProtocolMessage::CollateOn(test_state.para_id),
)
.await;
// Activate `head_b` first, then replace the view with its child `head_a`.
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
&mut virtual_overseer,
vec![(head_b, head_b_num)],
1,
)
.await;
update_view(
Some(test_state.current_group_validator_authority_ids()),
&test_state,
&mut virtual_overseer,
vec![(head_a, head_a_num)],
1,
)
.await;
// Two candidates on `head_b` that differ only in their PoV.
let candidates: Vec<_> = (0..2)
.map(|i| {
let pov = PoV { block_data: BlockData(vec![i as u8]) };
let candidate = TestCandidateBuilder {
para_id: test_state.para_id,
relay_parent: head_b,
pov_hash: pov.hash(),
..Default::default()
}
.build();
(candidate, pov)
})
.collect();
for (candidate, pov) in &candidates {
distribute_collation_with_receipt(
&mut virtual_overseer,
test_state.current_group_validator_authority_ids(),
candidate.clone(),
pov.clone(),
Hash::zero(),
)
.await;
}
let peer = test_state.validator_peer_id[0];
let validator_id = test_state.current_group_validator_authority_ids()[0].clone();
connect_peer(
&mut virtual_overseer,
peer,
CollationVersion::V2,
Some(validator_id.clone()),
)
.await;
expect_declare_msg(&mut virtual_overseer, &test_state, &peer).await;
// Head `b` is not a leaf, but both advertisements are still relevant.
send_peer_view_change(&mut virtual_overseer, &peer, vec![head_b]).await;
let hashes: Vec<_> = candidates.iter().map(|(candidate, _)| candidate.hash()).collect();
expect_advertise_collation_msg(&mut virtual_overseer, &[peer], head_b, hashes).await;
// Request every candidate by hash and compare the response with what was distributed.
for (candidate, pov_block) in candidates {
let (pending_response, rx) = oneshot::channel();
req_v2_cfg
.inbound_queue
.as_mut()
.unwrap()
.send(RawIncomingRequest {
peer,
payload: CollationFetchingRequest {
relay_parent: head_b,
para_id: test_state.para_id,
candidate_hash: candidate.hash(),
}
.encode(),
pending_response,
})
.await
.unwrap();
assert_matches!(
rx.await,
Ok(full_response) => {
// Response is the same for v2.
let (receipt, pov) = decode_collation_response(
full_response.result
.expect("We should have a proper answer").as_ref()
);
assert_eq!(receipt, candidate);
assert_eq!(pov, pov_block);
}
);
}
// Hand both halves of the harness back to `test_harness`.
TestHarness { virtual_overseer, req_v2_cfg }
},
)
}
@@ -0,0 +1,204 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! The Collator Protocol allows collators and validators to talk to each other.
//! This subsystem implements both sides of the collator protocol.
#![deny(missing_docs)]
#![deny(unused_crate_dependencies)]
#![recursion_limit = "256"]
use std::{
collections::HashSet,
time::{Duration, Instant},
};
use futures::{
stream::{FusedStream, StreamExt},
FutureExt, TryFutureExt,
};
use pezkuwi_node_subsystem_util::reputation::ReputationAggregator;
use sp_keystore::KeystorePtr;
use pezkuwi_node_network_protocol::{
request_response::{v2 as protocol_v2, IncomingRequestReceiver},
PeerId, UnifiedReputationChange as Rep,
};
use pezkuwi_primitives::CollatorPair;
use pezkuwi_node_subsystem::{errors::SubsystemError, overseer, DummySubsystem, SpawnedSubsystem};
mod collator_side;
mod validator_side;
#[cfg(feature = "experimental-collator-protocol")]
mod validator_side_experimental;
// Log target used by the collator protocol subsystem.
const LOG_TARGET: &str = "teyrchain::collator-protocol";
// Separate log target for collator protocol statistics.
const LOG_TARGET_STATS: &str = "teyrchain::collator-protocol::stats";
/// A collator eviction policy - how fast to evict collators which are inactive.
#[derive(Debug, Clone, Copy)]
pub struct CollatorEvictionPolicy {
    /// How fast to evict collators who are inactive.
    pub inactive_collator: Duration,
    /// How fast to evict peers which don't declare their para.
    pub undeclared: Duration,
}

impl Default for CollatorEvictionPolicy {
    /// Defaults to 24 seconds for inactive collators and 1 second for undeclared peers.
    fn default() -> Self {
        let inactive_collator = Duration::from_secs(24);
        let undeclared = Duration::from_secs(1);
        Self { inactive_collator, undeclared }
    }
}
/// What side of the collator protocol is being engaged.
///
/// Each variant carries the configuration needed to run that side of the protocol.
pub enum ProtocolSide {
/// Validators operate on the relay chain.
Validator {
/// The keystore holding validator keys.
keystore: KeystorePtr,
/// An eviction policy for inactive peers or validators.
eviction_policy: CollatorEvictionPolicy,
/// Prometheus metrics for validators.
metrics: validator_side::Metrics,
/// List of invulnerable collators which are handled with priority.
invulnerables: HashSet<PeerId>,
/// Override for the `HOLD_OFF_DURATION` constant.
collator_protocol_hold_off: Option<Duration>,
},
/// Experimental variant of the validator side. Do not use in production.
#[cfg(feature = "experimental-collator-protocol")]
ValidatorExperimental {
/// The keystore holding validator keys.
keystore: KeystorePtr,
/// Prometheus metrics for validators.
metrics: validator_side_experimental::Metrics,
},
/// Collators operate on a teyrchain.
Collator {
/// Local peer id.
peer_id: PeerId,
/// Teyrchain collator pair.
collator_pair: CollatorPair,
/// Receiver for v2 collation fetching requests.
request_receiver_v2: IncomingRequestReceiver<protocol_v2::CollationFetchingRequest>,
/// Metrics.
metrics: collator_side::Metrics,
},
/// No protocol side, just disable it.
None,
}
/// The collator protocol subsystem.
pub struct CollatorProtocolSubsystem {
// The side of the protocol this instance runs once started, with its configuration.
protocol_side: ProtocolSide,
}
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
impl CollatorProtocolSubsystem {
/// Create the collator protocol subsystem for the given `protocol_side`.
///
/// The [`ProtocolSide`] variant selects which side of the protocol is run and carries
/// the configuration for that side.
pub fn new(protocol_side: ProtocolSide) -> Self {
Self { protocol_side }
}
}
#[overseer::subsystem(CollatorProtocol, error=SubsystemError, prefix=self::overseer)]
impl<Context> CollatorProtocolSubsystem {
// Builds the future for the configured protocol side and wraps it in a
// `SpawnedSubsystem` named "collator-protocol-subsystem". Errors of every side are
// converted into a `SubsystemError` with the origin "collator-protocol".
fn start(self, ctx: Context) -> SpawnedSubsystem {
let future = match self.protocol_side {
ProtocolSide::Validator {
keystore,
eviction_policy,
metrics,
invulnerables,
collator_protocol_hold_off,
} => {
// Log the validator-specific parameters before starting.
gum::trace!(
target: LOG_TARGET,
?invulnerables,
?collator_protocol_hold_off,
"AH collator protocol params",
);
validator_side::run(
ctx,
keystore,
eviction_policy,
metrics,
invulnerables,
collator_protocol_hold_off,
)
.map_err(|e| SubsystemError::with_origin("collator-protocol", e))
.boxed()
},
#[cfg(feature = "experimental-collator-protocol")]
ProtocolSide::ValidatorExperimental { keystore, metrics } =>
validator_side_experimental::run(ctx, keystore, metrics)
.map_err(|e| SubsystemError::with_origin("collator-protocol", e))
.boxed(),
ProtocolSide::Collator { peer_id, collator_pair, request_receiver_v2, metrics } =>
collator_side::run(ctx, peer_id, collator_pair, request_receiver_v2, metrics)
.map_err(|e| SubsystemError::with_origin("collator-protocol", e))
.boxed(),
// No side configured: delegate to `DummySubsystem` instead of building a future.
ProtocolSide::None => return DummySubsystem.start(ctx),
};
SpawnedSubsystem { name: "collator-protocol-subsystem", future }
}
}
/// Modify the reputation of a peer based on its behavior.
///
/// Logs the change at trace level and then forwards it to the `reputation` aggregator,
/// which is given `sender` to apply it.
async fn modify_reputation(
reputation: &mut ReputationAggregator,
sender: &mut impl overseer::CollatorProtocolSenderTrait,
peer: PeerId,
rep: Rep,
) {
gum::trace!(
target: LOG_TARGET,
rep = ?rep,
peer_id = %peer,
"reputation change for peer",
);
reputation.modify(sender, peer, rep).await;
}
/// Wait until the tick that follows `last_poll` and return the time of waking up.
///
/// Sleeps until `last_poll + period` if that instant is still in the future and returns
/// immediately otherwise. The returned instant is the current time after the wait, which the
/// caller passes back in as `last_poll` for the following tick.
async fn wait_until_next_tick(last_poll: Instant, period: Duration) -> Instant {
    let started = Instant::now();
    let deadline = last_poll + period;
    if started < deadline {
        futures_timer::Delay::new(deadline - started).await;
    }
    Instant::now()
}
/// Returns an infinite stream that yields with an interval of `period`.
///
/// The stream state is the instant of the previous tick, starting with the moment the stream
/// is created; every item is produced after `wait_until_next_tick` has completed.
fn tick_stream(period: Duration) -> impl FusedStream<Item = ()> {
    let ticks = futures::stream::unfold(Instant::now(), move |previous_tick| async move {
        let woke_at = wait_until_next_tick(previous_tick, period).await;
        Some(((), woke_at))
    });
    ticks.fuse()
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,392 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Primitives for tracking collations-related data.
//!
//! Usually a path of collations is as follows:
//! 1. First, collation must be advertised by collator.
//! 2. The validator inspects the claim queue and decides if the collation should be fetched
//! based on the entries there. A teyrchain can't have more fetched collations than the
//! entries in the claim queue at a specific relay parent. When calculating this limit the
//! validator counts all advertisements within its view not just at the relay parent.
//! 3. If the advertisement was accepted, it's queued for fetch (per relay parent).
//! 4. Once it's requested, the collation is said to be pending fetch
//! (`CollationStatus::Fetching`).
//! 5. Pending fetch collation becomes pending validation
//! (`CollationStatus::WaitingOnValidation`) once received, we send it to backing for
//! validation.
//! 6. If it turns out to be invalid or async backing allows seconding another candidate, carry on
//! with the next advertisement, otherwise we're done with this relay parent.
//!
//! ┌───────────────────────────────────┐
//! └─▶Waiting ─▶ Fetching ─▶ WaitingOnValidation
use std::{
collections::{BTreeMap, VecDeque},
future::Future,
pin::Pin,
task::Poll,
};
use futures::{future::BoxFuture, FutureExt};
use pezkuwi_node_network_protocol::{
peer_set::CollationVersion,
request_response::{outgoing::RequestError, v1 as request_v1, OutgoingResult},
PeerId,
};
use pezkuwi_node_primitives::PoV;
use pezkuwi_node_subsystem_util::metrics::prometheus::prometheus::HistogramTimer;
use pezkuwi_primitives::{
CandidateHash, CandidateReceiptV2 as CandidateReceipt, CollatorId, Hash, HeadData,
Id as ParaId, PersistedValidationData,
};
use tokio_util::sync::CancellationToken;
use super::error::SecondingError;
use crate::LOG_TARGET;
/// Candidate supplied with a para head it's built on top of.
///
/// Both values are kept as hashes only.
#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)]
pub struct ProspectiveCandidate {
/// Candidate hash.
pub candidate_hash: CandidateHash,
/// Parent head-data hash as supplied in advertisement.
pub parent_head_data_hash: Hash,
}
impl ProspectiveCandidate {
/// Returns the candidate hash.
pub fn candidate_hash(&self) -> CandidateHash {
self.candidate_hash
}
}
/// Identifier of a fetched collation.
///
/// Can be built from a candidate receipt through the `From<&CandidateReceipt<Hash>>`
/// implementation.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct FetchedCollation {
/// Candidate's relay parent.
pub relay_parent: Hash,
/// Teyrchain id.
pub para_id: ParaId,
/// Candidate hash.
pub candidate_hash: CandidateHash,
}
impl From<&CandidateReceipt<Hash>> for FetchedCollation {
    /// Builds the identifier from the relay parent and para id of the receipt's descriptor
    /// and from the hash of the receipt.
    fn from(receipt: &CandidateReceipt<Hash>) -> Self {
        let descriptor = receipt.descriptor();
        let relay_parent = descriptor.relay_parent();
        let para_id = descriptor.para_id();
        let candidate_hash = receipt.hash();
        Self { relay_parent, para_id, candidate_hash }
    }
}
/// Identifier of a collation being requested.
#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)]
pub struct PendingCollation {
/// Candidate's relay parent.
pub relay_parent: Hash,
/// Teyrchain id.
pub para_id: ParaId,
/// Peer that advertised this collation.
pub peer_id: PeerId,
/// Optional candidate hash and parent head-data hash, if they were
/// supplied in the advertisement.
pub prospective_candidate: Option<ProspectiveCandidate>,
/// Hash of the candidate's commitments.
pub commitments_hash: Option<Hash>,
}
impl PendingCollation {
    /// Creates a pending collation from the data of an advertisement.
    ///
    /// The peer id is copied out of `peer_id`, and `commitments_hash` starts out as `None`.
    pub fn new(
        relay_parent: Hash,
        para_id: ParaId,
        peer_id: &PeerId,
        prospective_candidate: Option<ProspectiveCandidate>,
    ) -> Self {
        let peer_id = *peer_id;
        Self { relay_parent, para_id, peer_id, prospective_candidate, commitments_hash: None }
    }
}
/// An identifier for a fetched collation that was blocked from being seconded because we don't
/// have access to the parent's `HeadData`. Can be retried once the candidate outputting this
/// head data is seconded.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct BlockedCollationId {
/// Para id.
pub para_id: ParaId,
/// Hash of the parent head data.
pub parent_head_data_hash: Hash,
}
/// Checks that a fetched collation is consistent with what was advertised.
///
/// The checks run in this order and the first failure is returned:
/// 1. the hash of `persisted_validation_data` equals the one in the receipt's descriptor;
/// 2. if the advertisement carried a prospective candidate, its candidate hash equals the hash
///    of the fetched receipt;
/// 3. the advertised relay parent equals the one in the receipt's descriptor;
/// 4. if a parent head and its expected hash were supplied, the head hashes to that value.
pub fn fetched_collation_sanity_check(
    advertised: &PendingCollation,
    fetched: &CandidateReceipt,
    persisted_validation_data: &PersistedValidationData,
    maybe_parent_head_and_hash: Option<(HeadData, Hash)>,
) -> Result<(), SecondingError> {
    let pvd_hash = persisted_validation_data.hash();
    if pvd_hash != fetched.descriptor().persisted_validation_data_hash() {
        return Err(SecondingError::PersistedValidationDataMismatch);
    }

    if let Some(prospective) = advertised.prospective_candidate {
        if prospective.candidate_hash() != fetched.hash() {
            return Err(SecondingError::CandidateHashMismatch);
        }
    }

    if advertised.relay_parent != fetched.descriptor.relay_parent() {
        return Err(SecondingError::RelayParentMismatch);
    }

    if let Some((head, expected_hash)) = maybe_parent_head_and_hash {
        if head.hash() != expected_hash {
            return Err(SecondingError::ParentHeadDataMismatch);
        }
    }

    Ok(())
}
/// Identifies a requested collation together with the collator that advertised it.
#[derive(Debug, Clone)]
pub struct CollationEvent {
/// Id of the collator.
pub collator_id: CollatorId,
/// The network protocol version the collator is using.
pub collator_protocol_version: CollationVersion,
/// The collation that was requested.
pub pending_collation: PendingCollation,
}
/// Data of a collation that has been fetched.
#[derive(Debug, Clone)]
pub struct PendingCollationFetch {
/// Identifies the collation and the collator it came from.
pub collation_event: CollationEvent,
/// Candidate receipt.
pub candidate_receipt: CandidateReceipt,
/// Proof of validity.
pub pov: PoV,
/// Optional teyrchain parent head data.
/// Only needed for elastic scaling.
pub maybe_parent_head_data: Option<HeadData>,
}
/// The status of the collations in [`CollationsPerRelayParent`].
#[derive(Debug, Clone, Copy)]
pub enum CollationStatus {
/// We are waiting for a collation to be advertised to us.
Waiting,
/// We are currently fetching a collation for the specified `ParaId`.
Fetching(ParaId),
/// We are waiting for a collation to be validated.
WaitingOnValidation,
}
impl Default for CollationStatus {
fn default() -> Self {
Self::Waiting
}
}
impl CollationStatus {
/// Downgrades to `Waiting`
pub fn back_to_waiting(&mut self) {
*self = Self::Waiting
}
}
/// The number of claims in the claim queue and seconded candidates count for a specific `ParaId`.
#[derive(Default, Debug)]
struct CandidatesStatePerPara {
/// How many collations have been seconded.
pub seconded_per_para: usize,
/// Claims in the claim queue for the `ParaId`.
pub claims_per_para: usize,
}
/// Information about collations per relay parent.
pub struct Collations {
/// What is the current status with regard to a collation for this relay parent?
pub status: CollationStatus,
/// Collator we're fetching from, optionally which candidate was requested.
///
/// This is the most recently started fetch which has not yet exceeded
/// `MAX_UNSHARED_DOWNLOAD_TIME`.
pub fetching_from: Option<(CollatorId, Option<CandidateHash>)>,
/// Collations that were advertised to us, but which we have not yet requested or fetched.
/// Grouped by `ParaId`.
waiting_queue: BTreeMap<ParaId, VecDeque<(PendingCollation, CollatorId)>>,
/// Number of seconded candidates and claims in the claim queue per `ParaId`.
candidates_state: BTreeMap<ParaId, CandidatesStatePerPara>,
}
impl Collations {
    /// Creates the collation state for a relay parent.
    ///
    /// Every occurrence of a `ParaId` in `group_assignments` counts as one claim for that para.
    /// The status starts at its default and the waiting queue starts empty.
    ///
    /// Takes a slice rather than `&Vec`; existing callers passing `&Vec<ParaId>` keep working
    /// through deref coercion.
    pub(super) fn new(group_assignments: &[ParaId]) -> Self {
        let mut candidates_state = BTreeMap::<ParaId, CandidatesStatePerPara>::new();
        for para_id in group_assignments {
            candidates_state.entry(*para_id).or_default().claims_per_para += 1;
        }

        Self {
            status: Default::default(),
            fetching_from: None,
            waiting_queue: Default::default(),
            candidates_state,
        }
    }

    /// Note a seconded collation for a given para.
    ///
    /// Increments the seconded counter of the para (creating its entry if needed) and puts the
    /// status back to `Waiting`.
    pub(super) fn note_seconded(&mut self, para_id: ParaId) {
        // Look the entry up once and reuse it for both the increment and the log line.
        let state = self.candidates_state.entry(para_id).or_default();
        state.seconded_per_para += 1;
        gum::trace!(
            target: LOG_TARGET,
            ?para_id,
            new_count = state.seconded_per_para,
            "Note seconded."
        );
        self.status.back_to_waiting();
    }

    /// Adds a new collation to the waiting queue for the relay parent. This function doesn't
    /// perform any limits check. The caller should assure that the collation limit is respected.
    pub(super) fn add_to_waiting_queue(&mut self, collation: (PendingCollation, CollatorId)) {
        self.waiting_queue.entry(collation.0.para_id).or_default().push_back(collation);
    }

    /// Picks a collation to fetch from the waiting queue.
    ///
    /// When fetching collations we need to ensure that each teyrchain has got a fair core time
    /// share depending on its assignments in the claim queue. This means that the number of
    /// collations seconded per teyrchain should ideally be equal to the number of claims for the
    /// particular teyrchain in the claim queue.
    ///
    /// To achieve this each seconded collation is mapped to an entry from the claim queue. The next
    /// fetch is the first unfulfilled entry from the claim queue for which there is an
    /// advertisement.
    ///
    /// `unfulfilled_claim_queue_entries` represents all claim queue entries which are still not
    /// fulfilled. Returns `None` when none of them has a queued advertisement.
    pub(super) fn pick_a_collation_to_fetch(
        &mut self,
        unfulfilled_claim_queue_entries: Vec<ParaId>,
    ) -> Option<(PendingCollation, CollatorId)> {
        gum::trace!(
            target: LOG_TARGET,
            waiting_queue=?self.waiting_queue,
            candidates_state=?self.candidates_state,
            ?unfulfilled_claim_queue_entries,
            "Pick a collation to fetch."
        );

        // The first unfulfilled assignment with a queued advertisement wins; that advertisement
        // is removed from the front of its queue.
        unfulfilled_claim_queue_entries.into_iter().find_map(|assignment| {
            self.waiting_queue.get_mut(&assignment).and_then(|queue| queue.pop_front())
        })
    }

    /// Number of collations noted as seconded for `para_id` (0 if the para is unknown).
    pub(super) fn seconded_for_para(&self, para_id: &ParaId) -> usize {
        self.candidates_state
            .get(para_id)
            .map(|state| state.seconded_per_para)
            .unwrap_or_default()
    }

    /// Number of advertisements currently queued for `para_id` (0 if the para has no queue).
    pub(super) fn queued_for_para(&self, para_id: &ParaId) -> usize {
        self.waiting_queue.get(para_id).map(|queue| queue.len()).unwrap_or_default()
    }
}
/// Any error that can occur when awaiting a collation fetch response.
#[derive(Debug, thiserror::Error)]
pub(super) enum CollationFetchError {
/// The fetch was cancelled before a response arrived.
#[error("Future was cancelled.")]
Cancelled,
/// The underlying network request failed.
#[error("{0}")]
Request(#[from] RequestError),
}
/// Future that concludes when the collator has responded to our collation fetch request
/// or the request was cancelled by the validator.
pub(super) struct CollationFetchRequest {
/// Info about the requested collation.
pub pending_collation: PendingCollation,
/// Collator id.
pub collator_id: CollatorId,
/// The network protocol version the collator is using.
pub collator_protocol_version: CollationVersion,
/// Future resolving to the collator's response.
pub from_collator: BoxFuture<'static, OutgoingResult<request_v1::CollationFetchingResponse>>,
/// Handle used for checking if this request was cancelled.
pub cancellation_token: CancellationToken,
/// A metric histogram timer for the lifetime of the request; never read, only held so that
/// it lives as long as the request does.
pub _lifetime_timer: Option<HistogramTimer>,
}
impl Future for CollationFetchRequest {
    type Output = (
        CollationEvent,
        std::result::Result<request_v1::CollationFetchingResponse, CollationFetchError>,
    );

    /// Resolves with the collation event and either the collator's response or an error.
    ///
    /// Cancellation is checked first on every poll; a cancelled request resolves with
    /// `CollationFetchError::Cancelled` without polling the response future.
    fn poll(mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<Self::Output> {
        // NOTE(review): a fresh `cancelled()` future is created and dropped on each poll —
        // confirm that this keeps the task registered for wake-up on cancellation.
        let cancelled = std::pin::pin!(self.cancellation_token.cancelled()).poll(cx).is_ready();

        let outcome = if cancelled {
            Err(CollationFetchError::Cancelled)
        } else {
            match self.from_collator.poll_unpin(cx) {
                Poll::Ready(response) => response.map_err(CollationFetchError::Request),
                Poll::Pending => return Poll::Pending,
            }
        };

        let event = CollationEvent {
            collator_protocol_version: self.collator_protocol_version,
            collator_id: self.collator_id.clone(),
            pending_collation: self.pending_collation,
        };
        Poll::Ready((event, outcome))
    }
}
@@ -0,0 +1,140 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use fatality::thiserror::Error;
use futures::channel::oneshot;
use pezkuwi_node_subsystem::RuntimeApiError;
use pezkuwi_node_subsystem_util::backing_implicit_view;
use pezkuwi_primitives::CandidateDescriptorVersion;
/// General result type of this subsystem, with [`Error`] as the error.
pub type Result<T> = std::result::Result<T, Error>;
/// General subsystem error.
#[derive(Error, Debug)]
pub enum Error {
/// Error from the implicit view helper; displayed transparently.
#[error(transparent)]
ImplicitViewFetchError(backing_implicit_view::FetchError),
/// The response channel of the active validators request was cancelled.
#[error("Response receiver for active validators request cancelled")]
CancelledActiveValidators(oneshot::Canceled),
/// The response channel of the validator groups request was cancelled.
#[error("Response receiver for validator groups request cancelled")]
CancelledValidatorGroups(oneshot::Canceled),
/// The response channel of the session index request was cancelled.
#[error("Response receiver for session index request cancelled")]
CancelledSessionIndex(oneshot::Canceled),
/// The response channel of the claim queue request was cancelled.
#[error("Response receiver for claim queue request cancelled")]
CancelledClaimQueue(oneshot::Canceled),
/// The response channel of the node features request was cancelled.
#[error("Response receiver for node features request cancelled")]
CancelledNodeFeatures(oneshot::Canceled),
/// There is no state stored for the relay parent.
#[error("No state for the relay parent")]
RelayParentStateNotFound,
/// A Runtime API call failed.
#[error("Error while accessing Runtime API")]
RuntimeApi(#[from] RuntimeApiError),
}
/// An error that occurred when attempting to start seconding a candidate.
///
/// See `SecondingError::is_malicious` for which variants are attributed to peer misbehaviour.
#[derive(Debug, Error)]
pub enum SecondingError {
/// A Runtime API call failed.
#[error("Error while accessing Runtime API")]
RuntimeApi(#[from] RuntimeApiError),
/// The response channel of the persisted validation data request was cancelled.
#[error("Response receiver for persisted validation data request cancelled")]
CancelledRuntimePersistedValidationData(oneshot::Canceled),
/// The response channel of the prospective validation data request was cancelled.
#[error("Response receiver for prospective validation data request cancelled")]
CancelledProspectiveValidationData(oneshot::Canceled),
/// No persisted validation data is available.
#[error("Persisted validation data is not available")]
PersistedValidationDataNotFound,
/// The persisted validation data hash differs from the one in the candidate receipt.
#[error("Persisted validation data hash doesn't match one in the candidate receipt.")]
PersistedValidationDataMismatch,
/// The fetched candidate's hash differs from the advertised one.
#[error("Candidate hash doesn't match the advertisement")]
CandidateHashMismatch,
/// The fetched candidate's relay parent differs from the advertised one.
#[error("Relay parent hash doesn't match the advertisement")]
RelayParentMismatch,
/// The peer sent a collation we already received.
#[error("Received duplicate collation from the peer")]
Duplicate,
/// The supplied parent head data does not hash to the expected value.
#[error("The provided parent head data does not match the hash")]
ParentHeadDataMismatch,
/// Core index in the descriptor (first field) differs from the assigned core (second field).
#[error("Core index {0} present in descriptor is different than the assigned core {1}")]
InvalidCoreIndex(u32, u32),
/// Session index in the descriptor (first field) differs from the expected one (second field).
#[error("Session index {0} present in descriptor is different than the expected one {1}")]
InvalidSessionIndex(u32, u32),
/// The candidate receipt has an invalid descriptor version.
#[error("Invalid candidate receipt version {0:?}")]
InvalidReceiptVersion(CandidateDescriptorVersion),
}
impl SecondingError {
/// Returns true if an error indicates that a peer is malicious.
pub fn is_malicious(&self) -> bool {
use SecondingError::*;
matches!(
self,
PersistedValidationDataMismatch |
CandidateHashMismatch |
RelayParentMismatch |
ParentHeadDataMismatch |
InvalidCoreIndex(_, _) |
InvalidSessionIndex(_, _) |
InvalidReceiptVersion(_)
)
}
}
/// Reasons why requesting a collation failed.
#[derive(Debug, Error)]
pub enum FetchError {
/// The collation was never advertised to us.
#[error("Collation was not previously advertised")]
NotAdvertised,
/// The peer is not known.
#[error("Peer is unknown")]
UnknownPeer,
/// The collation has already been requested.
#[error("Collation was already requested")]
AlreadyRequested,
/// The relay parent is no longer in view.
#[error("Relay parent went out of view")]
RelayParentOutOfView,
/// The peer's protocol does not match the advertisement.
#[error("Peer's protocol doesn't match the advertisement")]
ProtocolMismatch,
}
/// Represents a `RelayParentHoldOffState` error.
#[derive(Debug, Error)]
pub enum HoldOffError {
/// `on_hold_off_complete` was called while the state was `NotStarted`.
#[error("`on_hold_off_complete` called in `NotStarted`")]
InvalidStateNotStarted,
/// `on_hold_off_complete` was called while the state was `Done`.
#[error("`on_hold_off_complete` called in `Done`")]
InvalidStateDone,
/// `on_hold_off_complete` was called in the right state, but no advertisements were queued.
#[error("`on_hold_off_complete` called in the right state but there are no advertisements in the queue")]
QueueEmpty,
}
@@ -0,0 +1,142 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::metrics::{self, prometheus};
/// Metrics handle of the validator side. Holds `None` (the `Default`) when no metrics were
/// registered, in which case every method is a no-op or returns `None`.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
impl Metrics {
    /// Counts a collation request under the `"succeeded"` or `"failed"` label value.
    pub fn on_request(&self, succeeded: std::result::Result<(), ()>) {
        let Some(metrics) = &self.0 else { return };
        let label = match succeeded {
            Ok(()) => "succeeded",
            Err(()) => "failed",
        };
        metrics.collation_requests.with_label_values(&[label]).inc();
    }

    /// Provide a timer for `process_msg` which observes on drop.
    pub fn time_process_msg(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.process_msg.start_timer())
    }

    /// Provide a timer for `handle_collation_request_result` which observes on drop.
    pub fn time_handle_collation_request_result(
        &self,
    ) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.handle_collation_request_result.start_timer())
    }

    /// Note the current number of collator peers.
    pub fn note_collator_peer_count(&self, collator_peers: usize) {
        if let Some(inner) = self.0.as_ref() {
            inner.collator_peer_count.set(collator_peers as u64);
        }
    }

    /// Provide a timer for `CollationFetchRequest` structure which observes on drop.
    pub fn time_collation_request_duration(
        &self,
    ) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.collation_request_duration.start_timer())
    }

    /// Provide a timer for `request_unblocked_collations` which observes on drop.
    pub fn time_request_unblocked_collations(
        &self,
    ) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
        let inner = self.0.as_ref()?;
        Some(inner.request_unblocked_collations.start_timer())
    }
}
/// The registered Prometheus metrics backing [`Metrics`].
#[derive(Clone)]
struct MetricsInner {
// Counter of collation requests, labelled by outcome.
collation_requests: prometheus::CounterVec<prometheus::U64>,
// Time spent in `process_msg`.
process_msg: prometheus::Histogram,
// Time spent in `handle_collation_request_result`.
handle_collation_request_result: prometheus::Histogram,
// Current number of connected collator peers.
collator_peer_count: prometheus::Gauge<prometheus::U64>,
// Lifetime of `CollationFetchRequest` structures.
collation_request_duration: prometheus::Histogram,
// Time spent in `request_unblocked_collations`.
request_unblocked_collations: prometheus::Histogram,
}
impl metrics::Metrics for Metrics {
    /// Creates all validator-side metrics and registers them with `registry`.
    ///
    /// Metrics are registered in field order; the first failure is returned.
    fn try_register(
        registry: &prometheus::Registry,
    ) -> std::result::Result<Self, prometheus::PrometheusError> {
        // Builds a histogram from `opts` and registers it.
        let register_histogram = |opts: prometheus::HistogramOpts| -> std::result::Result<
            prometheus::Histogram,
            prometheus::PrometheusError,
        > { prometheus::register(prometheus::Histogram::with_opts(opts)?, registry) };

        let collation_requests = prometheus::register(
            prometheus::CounterVec::new(
                prometheus::Opts::new(
                    "pezkuwi_teyrchain_collation_requests_total",
                    "Number of collations requested from Collators.",
                ),
                &["success"],
            )?,
            registry,
        )?;

        let process_msg = register_histogram(prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collator_protocol_validator_process_msg",
            "Time spent within `collator_protocol_validator::process_msg`",
        ))?;

        let handle_collation_request_result = register_histogram(prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collator_protocol_validator_handle_collation_request_result",
            "Time spent within `collator_protocol_validator::handle_collation_request_result`",
        ))?;

        let collator_peer_count = prometheus::register(
            prometheus::Gauge::new(
                "pezkuwi_teyrchain_collator_peer_count",
                "Amount of collator peers connected",
            )?,
            registry,
        )?;

        let collation_request_duration = register_histogram(
            prometheus::HistogramOpts::new(
                "pezkuwi_teyrchain_collator_protocol_validator_collation_request_duration",
                "Lifetime of the `CollationFetchRequest` structure",
            )
            .buckets(vec![0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0, 1.2, 1.5, 1.75]),
        )?;

        let request_unblocked_collations = register_histogram(prometheus::HistogramOpts::new(
            "pezkuwi_teyrchain_collator_protocol_validator_request_unblocked_collations",
            "Time spent within `collator_protocol_validator::request_unblocked_collations`",
        ))?;

        Ok(Metrics(Some(MetricsInner {
            collation_requests,
            process_msg,
            handle_collation_request_result,
            collator_peer_count,
            collation_request_duration,
            request_unblocked_collations,
        })))
    }
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,159 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::num::NonZeroU16;
use pezkuwi_node_network_protocol::peer_set::CollationVersion;
use pezkuwi_primitives::Id as ParaId;
/// Maximum reputation score.
pub const MAX_SCORE: u16 = 5000;
/// Limit for the total number of connected peers.
pub const CONNECTED_PEERS_LIMIT: NonZeroU16 = NonZeroU16::new(300).expect("300 is greater than 0");
/// Limit for the total number of connected peers for a paraid.
/// Must not exceed `CONNECTED_PEERS_LIMIT`; this is checked at compile time.
pub const CONNECTED_PEERS_PARA_LIMIT: NonZeroU16 = const {
// Fails the build if the global limit is ever lowered below the per-para limit.
assert!(CONNECTED_PEERS_LIMIT.get() >= 100);
NonZeroU16::new(100).expect("100 is greater than 0")
};
/// Maximum number of relay parents to process for reputation bumps on startup and between finality
/// notifications.
pub const MAX_STARTUP_ANCESTRY_LOOKBACK: u32 = 20;
/// Reputation bump for getting a valid candidate included.
pub const VALID_INCLUDED_CANDIDATE_BUMP: u16 = 50;
/// Reputation slash for peer inactivity (for each included candidate of the para that was not
/// authored by the peer).
pub const INACTIVITY_DECAY: u16 = 1;
/// Maximum number of stored peer scores for a paraid. Should be greater than
/// `CONNECTED_PEERS_PARA_LIMIT`.
pub const MAX_STORED_SCORES_PER_PARA: u8 = 150;
/// Reputation score type.
///
/// The wrapped value is always within `0..=MAX_SCORE`.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy, Default)]
pub struct Score(u16);

impl Score {
    /// Create a new instance. Returns `None` if `val` is over `MAX_SCORE`.
    pub const fn new(val: u16) -> Option<Self> {
        if val > MAX_SCORE {
            None
        } else {
            Some(Self(val))
        }
    }

    /// Add `val` to the inner value, saturating at `MAX_SCORE`.
    ///
    /// The sum is computed with `u16::saturating_add` before clamping, so even a `val` close to
    /// `u16::MAX` cannot overflow the intermediate result.
    pub fn saturating_add(&mut self, val: u16) {
        self.0 = self.0.saturating_add(val).min(MAX_SCORE);
    }

    /// Subtract `val` from the inner value, saturating at 0.
    pub fn saturating_sub(&mut self, val: u16) {
        self.0 = self.0.saturating_sub(val);
    }
}

impl From<Score> for u16 {
    /// Unwraps the raw score value.
    fn from(value: Score) -> Self {
        value.0
    }
}
/// Information about a connected peer.
#[derive(PartialEq, Debug, Clone)]
pub struct PeerInfo {
/// Collation protocol version of the peer.
pub version: CollationVersion,
/// State of the peer.
pub state: PeerState,
}
/// State of a connected peer.
#[derive(PartialEq, Debug, Clone)]
pub enum PeerState {
/// Connected, but has not declared yet.
Connected,
/// Peer has declared; carries the `ParaId` of the declaration.
Collating(ParaId),
}
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises the `Score` constructor and both saturating operations.
    #[test]
    fn score_functions() {
        assert!(MAX_SCORE > 50);

        // In-range values are accepted and round-trip through `u16`.
        (0..MAX_SCORE).step_by(10).for_each(|raw| {
            assert_eq!(u16::from(Score::new(raw).unwrap()), raw);
        });
        assert_eq!(u16::from(Score::new(MAX_SCORE).unwrap()), MAX_SCORE);

        // Values that exceed the limit are rejected by the constructor.
        ((MAX_SCORE + 1)..(MAX_SCORE + 50)).step_by(5).for_each(|raw| {
            assert_eq!(Score::new(raw), None);
        });

        // Base value for the saturating arithmetic checks.
        let base = Score::new(50).unwrap();
        let base_raw = u16::from(base);

        // Addition that stays within the limit.
        for delta in (0..(MAX_SCORE - 50)).step_by(10) {
            let mut bumped = base;
            bumped.saturating_add(delta);
            assert_eq!(base_raw + delta, u16::from(bumped));
        }

        // Addition that reaches or passes the limit saturates at `MAX_SCORE`.
        for delta in ((MAX_SCORE - 50)..MAX_SCORE).step_by(10) {
            let mut bumped = base;
            bumped.saturating_add(delta);
            assert_eq!(MAX_SCORE, u16::from(bumped));
        }

        // Subtraction that stays at or above zero.
        for delta in (0..50).step_by(10) {
            let mut slashed = base;
            slashed.saturating_sub(delta);
            assert_eq!(base_raw - delta, u16::from(slashed));
        }

        // Subtraction that would go below zero saturates at 0.
        for delta in (50..100).step_by(10) {
            let mut slashed = base;
            slashed.saturating_sub(delta);
            assert_eq!(0, u16::from(slashed));
        }
    }
}
@@ -0,0 +1,68 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::LOG_TARGET;
use fatality::Nested;
use pezkuwi_node_subsystem::{ChainApiError, SubsystemError};
use pezkuwi_node_subsystem_util::runtime;
use pezkuwi_primitives::Hash;
/// Result type whose error may be fatal or non-fatal.
pub type Result<T> = std::result::Result<T, Error>;
/// Result type that carries only fatal errors.
pub type FatalResult<T> = std::result::Result<T, FatalError>;
/// Errors of the experimental validator side. Variants marked `#[fatal]` are fatal; the
/// remaining ones are non-fatal and are only logged by `log_error`.
#[fatality::fatality(splitable)]
pub enum Error {
// The oneshot carrying the ancestors response was cancelled.
#[fatal]
#[error("Oneshot for receiving ancestors from chain API got cancelled")]
CanceledAncestors,
// The oneshot carrying the finalized block number was cancelled.
#[fatal]
#[error("Oneshot for receiving finalized block number from chain API got cancelled")]
CanceledFinalizedBlockNumber,
// The oneshot carrying the finalized block hash was cancelled.
#[fatal]
#[error("Oneshot for receiving finalized block hash from chain API got cancelled")]
CanceledFinalizedBlockHash,
// No finalized block hash was found for the given block number.
#[error("Finalized block hash for {0} not found")]
FinalizedBlockNotFound(u32),
// Error reported by the chain API; displayed transparently.
#[error(transparent)]
ChainApi(#[from] ChainApiError),
// Runtime information error; fatality is forwarded from the wrapped error.
#[fatal(forward)]
#[error("Error while accessing runtime information {0}")]
Runtime(#[from] runtime::Error),
// Receiving a message from the overseer failed.
#[fatal]
#[error("Receiving message from overseer failed: {0}")]
SubsystemReceive(#[source] SubsystemError),
}
/// Utility for consuming top-level errors by logging them.
///
/// We basically always want to try and continue on error: a fatal error is propagated to the
/// caller, while a non-fatal one is logged and turned into `Ok(())`.
pub fn log_error(result: Result<()>) -> FatalResult<()> {
    if let Err(jfyi) = result.into_nested()? {
        jfyi.log();
    }
    Ok(())
}
impl JfyiError {
/// Log a `JfyiError`.
pub fn log(self) {
gum::warn!(target: LOG_TARGET, error = ?self);
}
}
@@ -0,0 +1,28 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::metrics::{self, prometheus};
/// Placeholder metrics type: it holds no metrics and registers nothing.
#[derive(Clone, Default)]
pub struct Metrics;

impl metrics::Metrics for Metrics {
    /// Always succeeds; the registry is left untouched.
    fn try_register(
        _registry: &prometheus::Registry,
    ) -> std::result::Result<Self, prometheus::PrometheusError> {
        Ok(Self)
    }
}
@@ -0,0 +1,144 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
#![allow(unused)]
// See reasoning in Cargo.toml why this temporary useless import is needed.
use tokio as _;
mod common;
mod error;
mod metrics;
mod peer_manager;
mod state;
use std::collections::VecDeque;
use common::MAX_STORED_SCORES_PER_PARA;
use error::{log_error, FatalError, FatalResult, Result};
use fatality::Split;
use peer_manager::{Db, PeerManager};
use pezkuwi_node_subsystem::{
overseer, ActivatedLeaf, CollatorProtocolSenderTrait, FromOrchestra, OverseerSignal,
};
use pezkuwi_node_subsystem_util::{
find_validator_group, request_claim_queue, request_validator_groups, request_validators,
runtime::recv_runtime, signing_key_and_index,
};
use pezkuwi_primitives::{Hash, Id as ParaId};
use sp_keystore::KeystorePtr;
use state::State;
pub use metrics::Metrics;
use crate::LOG_TARGET;
/// The main run loop.
///
/// Currently this only performs initialization: the state returned by `initialize` is dropped
/// because the inner loop is not wired up yet (see the placeholder below). Fatal initialization
/// errors are propagated to the caller.
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
pub(crate) async fn run<Context>(
mut ctx: Context,
keystore: KeystorePtr,
metrics: Metrics,
) -> FatalResult<()> {
if let Some(_state) = initialize(&mut ctx, keystore, metrics).await? {
// Placeholder for the not-yet-implemented inner loop:
// run_inner(state);
}
Ok(())
}
/// Builds the initial subsystem state from the first activated leaf.
///
/// Returns `Ok(None)` if the subsystem was told to conclude before any leaf was activated.
/// Non-fatal errors are logged and initialization is retried with the next activated leaf;
/// fatal errors are returned.
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
async fn initialize<Context>(
    ctx: &mut Context,
    keystore: KeystorePtr,
    metrics: Metrics,
) -> FatalResult<Option<State<Db>>> {
    loop {
        let Some(first_leaf) = wait_for_first_leaf(ctx).await? else { return Ok(None) };

        let paras = match scheduled_paras(ctx.sender(), first_leaf.hash, &keystore).await {
            Ok(paras) => paras,
            Err(err) => {
                // Only fatal errors escape `log_error`; otherwise wait for the next leaf.
                log_error(Err(err))?;
                continue;
            },
        };

        let backend = Db::new(MAX_STORED_SCORES_PER_PARA).await;
        let startup =
            PeerManager::startup(backend, ctx.sender(), paras.into_iter().collect()).await;

        match startup {
            Ok(peer_manager) => return Ok(Some(State::new(peer_manager, keystore, metrics))),
            // Non-fatal: logged, then the loop starts over.
            Err(err) => log_error(Err(err))?,
        }
    }
}
/// Waits for the first `ActiveLeaves` update that activates a leaf and returns that leaf.
///
/// Returns `None` if the `Conclude` signal arrives first. Finality notifications are ignored
/// and any other message received in the meantime is logged and dropped.
#[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)]
async fn wait_for_first_leaf<Context>(ctx: &mut Context) -> FatalResult<Option<ActivatedLeaf>> {
    loop {
        let incoming = ctx.recv().await.map_err(FatalError::SubsystemReceive)?;

        match incoming {
            FromOrchestra::Signal(signal) => match signal {
                OverseerSignal::Conclude => return Ok(None),
                OverseerSignal::ActiveLeaves(update) => match update.activated {
                    Some(activated) => return Ok(Some(activated)),
                    // An update without an activated leaf: keep waiting.
                    None => {},
                },
                OverseerSignal::BlockFinalized(_, _) => {},
            },
            FromOrchestra::Communication { msg } => {
                // TODO: we should actually disconnect peers connected on collation protocol while
                // we're still bootstrapping. OR buffer these messages until we've bootstrapped.
                gum::warn!(
                    target: LOG_TARGET,
                    ?msg,
                    "Received msg before first active leaves update. This is not expected - message will be dropped."
                )
            },
        }
    }
}
/// Returns the claim queue entries of the core our validator group is assigned to at `hash`.
///
/// Returns an empty queue if no local signing key belongs to a validator group at `hash`, or
/// if the claim queue has no entry for our core.
async fn scheduled_paras<Sender: CollatorProtocolSenderTrait>(
    sender: &mut Sender,
    hash: Hash,
    keystore: &KeystorePtr,
) -> Result<VecDeque<ParaId>> {
    let validators = recv_runtime(request_validators(hash, sender).await).await?;

    let (groups, rotation_info) =
        recv_runtime(request_validator_groups(hash, sender).await).await?;

    let our_group = signing_key_and_index(&validators, keystore)
        .and_then(|(_, index)| find_validator_group(&groups, index));
    let Some(group) = our_group else {
        gum::trace!(target: LOG_TARGET, ?hash, "Not a validator");
        return Ok(VecDeque::new());
    };
    let core_now = rotation_info.core_for_group(group, groups.len());

    let mut claim_queue = recv_runtime(request_claim_queue(hash, sender).await).await?;
    let scheduled = claim_queue.remove(&core_now);
    Ok(scheduled.unwrap_or_default())
}
@@ -0,0 +1,46 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::validator_side_experimental::{common::Score, peer_manager::ReputationUpdate};
use async_trait::async_trait;
use pezkuwi_node_network_protocol::PeerId;
use pezkuwi_primitives::{BlockNumber, Id as ParaId};
use std::collections::{BTreeMap, BTreeSet, HashMap};
/// Trait describing the interface of the reputation database.
#[async_trait]
pub trait Backend {
/// Return the latest finalized block for which the backend processed bumps, if any.
async fn processed_finalized_block_number(&self) -> Option<BlockNumber>;
/// Get the peer's stored reputation for this paraid, if any.
async fn query(&self, peer_id: &PeerId, para_id: &ParaId) -> Option<Score>;
/// Slash the peer's reputation for this paraid by the given value.
async fn slash(&mut self, peer_id: &PeerId, para_id: &ParaId, value: Score);
/// Prune all data for paraids that are no longer in the given registered set.
async fn prune_paras(&mut self, registered_paras: BTreeSet<ParaId>);
/// Process the reputation bumps, returning all the reputation changes that were made as a
/// consequence.
///
/// The full list of changes is returned because a reputation bump for a para also means a
/// reputation decay for the other collators of that para (if the `decay_value` param is
/// present), and because if the number of stored reputations goes over the
/// `stored_limit_per_para`, the least recently bumped peers are slashed by 100%.
///
/// `leaf_number` needs to be at least equal to the `processed_finalized_block_number`.
async fn process_bumps(
&mut self,
leaf_number: BlockNumber,
bumps: BTreeMap<ParaId, HashMap<PeerId, Score>>,
decay_value: Option<Score>,
) -> Vec<ReputationUpdate>;
}
@@ -0,0 +1,765 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::validator_side_experimental::{
common::Score,
peer_manager::{backend::Backend, ReputationUpdate, ReputationUpdateKind},
};
use async_trait::async_trait;
use pezkuwi_node_network_protocol::PeerId;
use pezkuwi_primitives::{BlockNumber, Hash, Id as ParaId};
use std::{
collections::{btree_map, hash_map, BTreeMap, BTreeSet, HashMap},
time::{SystemTime, UNIX_EPOCH},
};
/// This is an in-memory temporary implementation for the DB, to be used only for prototyping and
/// testing purposes.
///
/// Nothing is persisted: all reputations live in the maps held by this struct.
pub struct Db {
// Reputation entries, grouped first by para and then by peer.
db: BTreeMap<ParaId, HashMap<PeerId, ScoreEntry>>,
// Block number passed to the last `process_bumps` call that was not ignored, if any.
last_finalized: Option<BlockNumber>,
// Maximum number of reputations that can be stored per para.
stored_limit_per_para: u8,
}
impl Db {
    /// Build an empty in-memory DB that has not processed any finalized block yet.
    ///
    /// `stored_limit_per_para` caps the number of reputations kept for each para.
    pub async fn new(stored_limit_per_para: u8) -> Self {
        let db = BTreeMap::new();
        let last_finalized = None;
        Self { db, last_finalized, stored_limit_per_para }
    }
}
// Wall-clock timestamp, stored as milliseconds since the UNIX epoch.
type Timestamp = u128;
/// A stored reputation together with the time it was last bumped.
#[derive(Clone, Debug)]
struct ScoreEntry {
// Current reputation of the peer for the para.
score: Score,
// Time of the most recent bump; used to pick the least recently bumped entries when the
// per-para limit is exceeded.
last_bumped: Timestamp,
}
#[async_trait]
impl Backend for Db {
    /// Block number recorded by the last `process_bumps` call that was not ignored.
    async fn processed_finalized_block_number(&self) -> Option<BlockNumber> {
        self.last_finalized
    }

    /// Look up the stored score of `peer_id` for `para_id`.
    async fn query(&self, peer_id: &PeerId, para_id: &ParaId) -> Option<Score> {
        let per_para = self.db.get(para_id)?;
        per_para.get(peer_id).map(|entry| entry.score)
    }

    /// Subtract `value` from the stored score of `peer_id` for `para_id`.
    ///
    /// Does nothing if there is no such entry. An entry whose score is not greater than `value`
    /// is removed, and so is a para that ends up without any entry.
    async fn slash(&mut self, peer_id: &PeerId, para_id: &ParaId, value: Score) {
        let btree_map::Entry::Occupied(mut para_slot) = self.db.entry(*para_id) else {
            return;
        };

        if let hash_map::Entry::Occupied(mut peer_slot) = para_slot.get_mut().entry(*peer_id) {
            let current = peer_slot.get().score;
            if current <= value {
                // The score would drop to zero, so the entry is not worth keeping.
                peer_slot.remove();
            } else {
                peer_slot.get_mut().score.saturating_sub(value.into());
            }
        }

        // Do not keep empty per-para maps around.
        if para_slot.get().is_empty() {
            para_slot.remove();
        }
    }

    /// Drop the data of every para that is not contained in `registered_paras`.
    async fn prune_paras(&mut self, registered_paras: BTreeSet<ParaId>) {
        self.db.retain(|para_id, _| registered_paras.contains(para_id));
    }

    /// Record `leaf_number` as processed and apply the bumps (and the optional decay).
    ///
    /// The call is ignored, returning no updates, when `leaf_number` is not strictly greater
    /// than the last processed block number. A DB that has not processed anything yet is
    /// treated as having processed block 0.
    async fn process_bumps(
        &mut self,
        leaf_number: BlockNumber,
        bumps: BTreeMap<ParaId, HashMap<PeerId, Score>>,
        decay_value: Option<Score>,
    ) -> Vec<ReputationUpdate> {
        let already_processed = self.last_finalized.unwrap_or(0);
        if leaf_number > already_processed {
            self.last_finalized = Some(leaf_number);
            self.bump_reputations(bumps, decay_value)
        } else {
            vec![]
        }
    }
}
impl Db {
fn bump_reputations(
&mut self,
bumps: BTreeMap<ParaId, HashMap<PeerId, Score>>,
maybe_decay_value: Option<Score>,
) -> Vec<ReputationUpdate> {
let mut reported_updates = vec![];
let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis();
for (para, bumps_per_para) in bumps {
reported_updates.reserve(bumps_per_para.len());
for (peer_id, bump) in bumps_per_para.iter() {
if u16::from(*bump) == 0 {
continue;
}
self.db
.entry(para)
.or_default()
.entry(*peer_id)
.and_modify(|e| {
e.score.saturating_add(u16::from(*bump));
e.last_bumped = now;
})
.or_insert(ScoreEntry { score: *bump, last_bumped: now });
reported_updates.push(ReputationUpdate {
peer_id: *peer_id,
para_id: para,
value: *bump,
kind: ReputationUpdateKind::Bump,
});
}
if let btree_map::Entry::Occupied(mut per_para_entry) = self.db.entry(para) {
if let Some(decay_value) = maybe_decay_value {
let peers_to_slash = per_para_entry
.get()
.keys()
.filter(|peer_id| !bumps_per_para.contains_key(peer_id))
.copied()
.collect::<Vec<PeerId>>();
for peer_id in peers_to_slash {
if let hash_map::Entry::Occupied(mut e) =
per_para_entry.get_mut().entry(peer_id)
{
// Remove the entry if it goes to zero.
if e.get_mut().score <= decay_value {
let score = e.remove().score;
reported_updates.push(ReputationUpdate {
peer_id,
para_id: para,
value: score,
kind: ReputationUpdateKind::Slash,
});
} else {
e.get_mut().score.saturating_sub(decay_value.into());
reported_updates.push(ReputationUpdate {
peer_id,
para_id: para,
value: decay_value,
kind: ReputationUpdateKind::Slash,
});
}
}
}
}
let per_para_limit = self.stored_limit_per_para as usize;
if per_para_entry.get().is_empty() {
// If the per_para length went to 0, remove it completely
per_para_entry.remove();
} else if per_para_entry.get().len() > per_para_limit {
// We have exceeded the maximum capacity, in which case we need to prune
// the least recently bumped values
let diff = per_para_entry.get().len() - per_para_limit;
Self::prune_for_para(&para, &mut per_para_entry, diff, &mut reported_updates);
}
}
}
reported_updates
}
/// Evict the `diff` least recently bumped entries of a para.
///
/// Every evicted entry is appended to `reported_updates` as a slash of its entire score.
/// Panics if the para stores fewer than `diff` entries.
fn prune_for_para(
    para_id: &ParaId,
    per_para: &mut btree_map::OccupiedEntry<ParaId, HashMap<PeerId, ScoreEntry>>,
    diff: usize,
    reported_updates: &mut Vec<ReputationUpdate>,
) {
    let mut left_to_evict = diff;
    while left_to_evict > 0 {
        // Find the entry with the oldest bump timestamp.
        let stalest_peer = per_para
            .get()
            .iter()
            .min_by_key(|(_, entry)| entry.last_bumped)
            .map(|(peer, _)| *peer)
            .expect("We know there are enough reps over the limit");

        if let Some(evicted) = per_para.get_mut().remove(&stalest_peer) {
            reported_updates.push(ReputationUpdate {
                peer_id: stalest_peer,
                para_id: *para_id,
                value: evicted.score,
                kind: ReputationUpdateKind::Slash,
            });
        }

        left_to_evict -= 1;
    }
}
// Test-only helper: number of paras that currently have at least one stored reputation.
#[cfg(test)]
fn len(&self) -> usize {
self.db.len()
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
#[tokio::test]
// Test different types of reputation updates and their effects: ignored updates, bumps, decay,
// removal of entries that decay to zero and eviction of the least recently bumped entries.
async fn test_reputation_updates() {
let mut db = Db::new(10).await;
assert_eq!(db.processed_finalized_block_number().await, None);
assert_eq!(db.len(), 0);
// Test empty update with no decay.
assert!(db.process_bumps(10, Default::default(), None).await.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(10));
assert_eq!(db.len(), 0);
// Test a query on a non-existent entry.
assert_eq!(db.query(&PeerId::random(), &ParaId::from(1000)).await, None);
// Test empty update with decay.
assert!(db
.process_bumps(11, Default::default(), Some(Score::new(1).unwrap()))
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(11));
assert_eq!(db.len(), 0);
// Test empty update with a leaf number smaller than the latest one.
assert!(db
.process_bumps(5, Default::default(), Some(Score::new(1).unwrap()))
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(11));
assert_eq!(db.len(), 0);
// Test an update with zeroed score.
assert!(db
.process_bumps(
12,
[(
ParaId::from(100),
[(PeerId::random(), Score::new(0).unwrap())].into_iter().collect()
)]
.into_iter()
.collect(),
Some(Score::new(1).unwrap())
)
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(12));
assert_eq!(db.len(), 0);
// Reuse the same block height (12). The update should not be taken into consideration.
let first_peer_id = PeerId::random();
let first_para_id = ParaId::from(100);
assert!(db
.process_bumps(
12,
[(first_para_id, [(first_peer_id, Score::new(10).unwrap())].into_iter().collect())]
.into_iter()
.collect(),
Some(Score::new(1).unwrap())
)
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(12));
assert_eq!(db.len(), 0);
assert_eq!(db.query(&first_peer_id, &first_para_id).await, None);
// Test a non-zero update on an empty DB.
assert_eq!(
db.process_bumps(
13,
[(first_para_id, [(first_peer_id, Score::new(10).unwrap())].into_iter().collect())]
.into_iter()
.collect(),
Some(Score::new(1).unwrap())
)
.await,
vec![ReputationUpdate {
peer_id: first_peer_id,
para_id: first_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(10).unwrap()
}]
);
assert_eq!(db.processed_finalized_block_number().await, Some(13));
assert_eq!(db.len(), 1);
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
// Query a non-existent peer_id for this para.
assert_eq!(db.query(&PeerId::random(), &first_para_id).await, None);
// Query this peer's rep for a different para.
assert_eq!(db.query(&first_peer_id, &ParaId::from(200)).await, None);
// Test a subsequent update with a lower block height. Will be ignored.
assert!(db
.process_bumps(
10,
[(first_para_id, [(first_peer_id, Score::new(10).unwrap())].into_iter().collect())]
.into_iter()
.collect(),
Some(Score::new(1).unwrap())
)
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(13));
assert_eq!(db.len(), 1);
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
let second_para_id = ParaId::from(200);
let second_peer_id = PeerId::random();
// Test a subsequent update with no decay.
assert_eq!(
db.process_bumps(
14,
[
(
first_para_id,
[(second_peer_id, Score::new(10).unwrap())].into_iter().collect()
),
(
second_para_id,
[(first_peer_id, Score::new(5).unwrap())].into_iter().collect()
)
]
.into_iter()
.collect(),
None
)
.await,
vec![
ReputationUpdate {
peer_id: second_peer_id,
para_id: first_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(10).unwrap()
},
ReputationUpdate {
peer_id: first_peer_id,
para_id: second_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(5).unwrap()
}
]
);
assert_eq!(db.len(), 2);
assert_eq!(db.processed_finalized_block_number().await, Some(14));
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
assert_eq!(
db.query(&second_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
assert_eq!(
db.query(&first_peer_id, &second_para_id).await.unwrap(),
Score::new(5).unwrap()
);
// Empty update with decay has no effect: decay is only applied to paras that are part of the
// bumps.
assert!(db
.process_bumps(15, Default::default(), Some(Score::new(1).unwrap()))
.await
.is_empty());
assert_eq!(db.processed_finalized_block_number().await, Some(15));
assert_eq!(db.len(), 2);
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
assert_eq!(
db.query(&second_peer_id, &first_para_id).await.unwrap(),
Score::new(10).unwrap()
);
assert_eq!(
db.query(&first_peer_id, &second_para_id).await.unwrap(),
Score::new(5).unwrap()
);
// Test a subsequent update with decay.
assert_eq!(
db.process_bumps(
16,
[
(
first_para_id,
[(first_peer_id, Score::new(10).unwrap())].into_iter().collect()
),
(
second_para_id,
[(second_peer_id, Score::new(10).unwrap())].into_iter().collect()
),
]
.into_iter()
.collect(),
Some(Score::new(1).unwrap())
)
.await,
vec![
ReputationUpdate {
peer_id: first_peer_id,
para_id: first_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(10).unwrap()
},
ReputationUpdate {
peer_id: second_peer_id,
para_id: first_para_id,
kind: ReputationUpdateKind::Slash,
value: Score::new(1).unwrap()
},
ReputationUpdate {
peer_id: second_peer_id,
para_id: second_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(10).unwrap()
},
ReputationUpdate {
peer_id: first_peer_id,
para_id: second_para_id,
kind: ReputationUpdateKind::Slash,
value: Score::new(1).unwrap()
},
]
);
assert_eq!(db.processed_finalized_block_number().await, Some(16));
assert_eq!(db.len(), 2);
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(20).unwrap()
);
assert_eq!(
db.query(&second_peer_id, &first_para_id).await.unwrap(),
Score::new(9).unwrap()
);
assert_eq!(
db.query(&first_peer_id, &second_para_id).await.unwrap(),
Score::new(4).unwrap()
);
assert_eq!(
db.query(&second_peer_id, &second_para_id).await.unwrap(),
Score::new(10).unwrap()
);
// Test a decay that makes the reputation go to 0 (the peer's entry will be removed). The
// reported slash value is the score the peer had left (4), not the decay value (5).
assert_eq!(
db.process_bumps(
17,
[(
second_para_id,
[(second_peer_id, Score::new(10).unwrap())].into_iter().collect()
),]
.into_iter()
.collect(),
Some(Score::new(5).unwrap())
)
.await,
vec![
ReputationUpdate {
peer_id: second_peer_id,
para_id: second_para_id,
kind: ReputationUpdateKind::Bump,
value: Score::new(10).unwrap()
},
ReputationUpdate {
peer_id: first_peer_id,
para_id: second_para_id,
kind: ReputationUpdateKind::Slash,
value: Score::new(4).unwrap()
}
]
);
assert_eq!(db.processed_finalized_block_number().await, Some(17));
assert_eq!(db.len(), 2);
assert_eq!(
db.query(&first_peer_id, &first_para_id).await.unwrap(),
Score::new(20).unwrap()
);
assert_eq!(
db.query(&second_peer_id, &first_para_id).await.unwrap(),
Score::new(9).unwrap()
);
assert_eq!(db.query(&first_peer_id, &second_para_id).await, None);
assert_eq!(
db.query(&second_peer_id, &second_para_id).await.unwrap(),
Score::new(20).unwrap()
);
// Test an update which ends up pruning least recently bumped entries. The per-para limit is
// 10.
let mut db = Db::new(10).await;
let peer_ids = (0..10).map(|_| PeerId::random()).collect::<Vec<_>>();
// Add an equal reputation for all peers.
assert_eq!(
db.process_bumps(
1,
[(
first_para_id,
peer_ids.iter().map(|peer_id| (*peer_id, Score::new(10).unwrap())).collect()
)]
.into_iter()
.collect(),
None,
)
.await
.len(),
10
);
assert_eq!(db.len(), 1);
for peer_id in peer_ids.iter() {
assert_eq!(db.query(peer_id, &first_para_id).await.unwrap(), Score::new(10).unwrap());
}
// Now sleep for 100 milliseconds and then bump the reputations of all peers except for the
// one with 4th index. We need to sleep so that the update time of the 4th peer is older than
// the rest.
tokio::time::sleep(Duration::from_millis(100)).await;
assert_eq!(
db.process_bumps(
2,
[(
first_para_id,
peer_ids
.iter()
.enumerate()
.filter_map(
|(i, peer_id)| (i != 4).then_some((*peer_id, Score::new(10).unwrap()))
)
.collect()
)]
.into_iter()
.collect(),
Some(Score::new(5).unwrap()),
)
.await
.len(),
10
);
for (i, peer_id) in peer_ids.iter().enumerate() {
if i == 4 {
assert_eq!(
db.query(peer_id, &first_para_id).await.unwrap(),
Score::new(5).unwrap()
);
} else {
assert_eq!(
db.query(peer_id, &first_para_id).await.unwrap(),
Score::new(20).unwrap()
);
}
}
// Now add an 11th peer. The 4th peer's entry is removed (it decays to zero), which brings
// the para back to the limit.
let new_peer = PeerId::random();
tokio::time::sleep(Duration::from_millis(100)).await;
assert_eq!(
db.process_bumps(
3,
[(first_para_id, [(new_peer, Score::new(10).unwrap())].into_iter().collect())]
.into_iter()
.collect(),
Some(Score::new(5).unwrap()),
)
.await
.len(),
11
);
for (i, peer_id) in peer_ids.iter().enumerate() {
if i == 4 {
assert_eq!(db.query(peer_id, &first_para_id).await, None);
} else {
assert_eq!(
db.query(peer_id, &first_para_id).await.unwrap(),
Score::new(15).unwrap()
);
}
}
assert_eq!(db.query(&new_peer, &first_para_id).await.unwrap(), Score::new(10).unwrap());
// Now try adding yet another peer. The decay would naturally evict the new peer so no need
// to evict the least recently bumped.
let yet_another_peer = PeerId::random();
assert_eq!(
db.process_bumps(
4,
[(
first_para_id,
[(yet_another_peer, Score::new(10).unwrap())].into_iter().collect()
)]
.into_iter()
.collect(),
Some(Score::new(10).unwrap()),
)
.await
.len(),
11
);
for (i, peer_id) in peer_ids.iter().enumerate() {
if i == 4 {
assert_eq!(db.query(peer_id, &first_para_id).await, None);
} else {
assert_eq!(
db.query(peer_id, &first_para_id).await.unwrap(),
Score::new(5).unwrap()
);
}
}
assert_eq!(db.query(&new_peer, &first_para_id).await, None);
assert_eq!(
db.query(&yet_another_peer, &first_para_id).await,
Some(Score::new(10).unwrap())
);
}
#[tokio::test]
// Test reputation slashes.
async fn test_slash() {
let mut db = Db::new(10).await;
// Test slash on empty DB
let peer_id = PeerId::random();
db.slash(&peer_id, &ParaId::from(100), Score::new(50).unwrap()).await;
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await, None);
// Populate the DB with three paras. No decay is reported because every para only has the
// peer that is being bumped.
let another_peer_id = PeerId::random();
assert_eq!(
db.process_bumps(
1,
[
(ParaId::from(100), [(peer_id, Score::new(10).unwrap())].into_iter().collect()),
(
ParaId::from(200),
[(another_peer_id, Score::new(12).unwrap())].into_iter().collect()
),
(ParaId::from(300), [(peer_id, Score::new(15).unwrap())].into_iter().collect())
]
.into_iter()
.collect(),
Some(Score::new(10).unwrap()),
)
.await
.len(),
3
);
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await.unwrap(), Score::new(10).unwrap());
assert_eq!(
db.query(&another_peer_id, &ParaId::from(200)).await.unwrap(),
Score::new(12).unwrap()
);
assert_eq!(db.query(&peer_id, &ParaId::from(300)).await.unwrap(), Score::new(15).unwrap());
// Test slash of a peer that has no stored reputation for the given para. Nothing changes.
db.slash(&peer_id, &ParaId::from(200), Score::new(4).unwrap()).await;
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await.unwrap(), Score::new(10).unwrap());
assert_eq!(
db.query(&another_peer_id, &ParaId::from(200)).await.unwrap(),
Score::new(12).unwrap()
);
assert_eq!(db.query(&peer_id, &ParaId::from(300)).await.unwrap(), Score::new(15).unwrap());
// Test regular slash
db.slash(&peer_id, &ParaId::from(100), Score::new(4).unwrap()).await;
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await.unwrap(), Score::new(6).unwrap());
// Test slash which removes the entry altogether. Para 100 is left without entries, so it is
// removed as well and only two paras remain.
db.slash(&peer_id, &ParaId::from(100), Score::new(8).unwrap()).await;
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await, None);
assert_eq!(db.len(), 2);
}
#[tokio::test]
// Test para pruning.
async fn test_prune_paras() {
let mut db = Db::new(10).await;
// Pruning an empty DB is a no-op, regardless of the registered set.
db.prune_paras(BTreeSet::new()).await;
assert_eq!(db.len(), 0);
db.prune_paras([ParaId::from(100), ParaId::from(200)].into_iter().collect())
.await;
assert_eq!(db.len(), 0);
// Populate the DB with three paras.
let peer_id = PeerId::random();
let another_peer_id = PeerId::random();
assert_eq!(
db.process_bumps(
1,
[
(ParaId::from(100), [(peer_id, Score::new(10).unwrap())].into_iter().collect()),
(
ParaId::from(200),
[(another_peer_id, Score::new(12).unwrap())].into_iter().collect()
),
(ParaId::from(300), [(peer_id, Score::new(15).unwrap())].into_iter().collect())
]
.into_iter()
.collect(),
Some(Score::new(10).unwrap()),
)
.await
.len(),
3
);
assert_eq!(db.len(), 3);
// Registered paras include the existing ones. Does nothing
db.prune_paras(
[ParaId::from(100), ParaId::from(200), ParaId::from(300), ParaId::from(400)]
.into_iter()
.collect(),
)
.await;
assert_eq!(db.len(), 3);
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await.unwrap(), Score::new(10).unwrap());
assert_eq!(
db.query(&another_peer_id, &ParaId::from(200)).await.unwrap(),
Score::new(12).unwrap()
);
assert_eq!(db.query(&peer_id, &ParaId::from(300)).await.unwrap(), Score::new(15).unwrap());
// Prunes multiple paras: only para 300 is still registered.
db.prune_paras([ParaId::from(300)].into_iter().collect()).await;
assert_eq!(db.len(), 1);
assert_eq!(db.query(&peer_id, &ParaId::from(100)).await, None);
assert_eq!(db.query(&another_peer_id, &ParaId::from(200)).await, None);
assert_eq!(db.query(&peer_id, &ParaId::from(300)).await.unwrap(), Score::new(15).unwrap());
// Prunes all paras.
db.prune_paras(BTreeSet::new()).await;
assert_eq!(db.len(), 0);
assert_eq!(db.query(&peer_id, &ParaId::from(300)).await, None);
}
}
@@ -0,0 +1,518 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
mod backend;
mod connected;
mod db;
use futures::channel::oneshot;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use crate::{
validator_side_experimental::{
common::{
PeerInfo, PeerState, Score, CONNECTED_PEERS_LIMIT, CONNECTED_PEERS_PARA_LIMIT,
INACTIVITY_DECAY, MAX_STARTUP_ANCESTRY_LOOKBACK, MAX_STORED_SCORES_PER_PARA,
VALID_INCLUDED_CANDIDATE_BUMP,
},
error::{Error, Result},
},
LOG_TARGET,
};
pub use backend::Backend;
use connected::ConnectedPeers;
pub use db::Db;
use pezkuwi_node_network_protocol::{
peer_set::{CollationVersion, PeerSet},
PeerId,
};
use pezkuwi_node_subsystem::{
messages::{ChainApiMessage, NetworkBridgeTxMessage},
ActivatedLeaf, CollatorProtocolSenderTrait,
};
use pezkuwi_node_subsystem_util::{
request_candidate_events, request_candidates_pending_availability, runtime::recv_runtime,
};
use pezkuwi_primitives::{
BlockNumber, CandidateDescriptorVersion, CandidateEvent, CandidateHash, Hash, Id as ParaId,
};
/// A change that was applied to the reputation of a peer for a given para.
#[derive(Debug, PartialEq, Clone)]
pub struct ReputationUpdate {
/// The peer whose reputation changed.
pub peer_id: PeerId,
/// The para for which the reputation changed.
pub para_id: ParaId,
/// The magnitude of the change. Whether it was added or subtracted is given by `kind`.
pub value: Score,
/// The direction of the change.
pub kind: ReputationUpdateKind,
}
/// The direction of a [`ReputationUpdate`].
#[derive(Debug, PartialEq, Clone)]
pub enum ReputationUpdateKind {
/// The reputation was increased.
Bump,
/// The reputation was decreased.
Slash,
}
// Result of trying to accept a peer into the set of connected peers.
#[derive(Debug, PartialEq)]
enum TryAcceptOutcome {
// The peer was accepted.
Added,
// The peer was accepted by taking over the connection slots of the contained peers, which
// should now be disconnected.
// This can hold more than one `PeerId` because before receiving the `Declare` message,
// one peer can hold connection slots for multiple paraids.
// The set can also be empty if this peer replaced some other peer's slot but that other peer
// maintained a connection slot for another para (therefore not disconnected).
// The number of peers in the set is bound to the number of scheduled paras.
Replaced(HashSet<PeerId>),
// The peer was not accepted.
Rejected,
}
impl TryAcceptOutcome {
    /// Merge two outcomes into one.
    ///
    /// `Replaced` wins over everything else (two `Replaced` sets are unioned), `Added` wins over
    /// `Rejected`, and the result is `Rejected` only if both outcomes are `Rejected`.
    fn combine(self, other: Self) -> Self {
        use TryAcceptOutcome::*;
        match (self, other) {
            (Replaced(mut ours), Replaced(theirs)) => {
                ours.extend(theirs);
                Replaced(ours)
            },
            (Replaced(peers), _) | (_, Replaced(peers)) => Replaced(peers),
            (Rejected, Rejected) => Rejected,
            (Added, _) | (_, Added) => Added,
        }
    }
}
// Result of processing a peer's declaration for a para.
#[derive(Debug, PartialEq)]
enum DeclarationOutcome {
// The declaration was rejected. The peer manager disconnects the peer in this case.
Rejected,
// The peer switched to a different para. Holds the para it was previously collating for. The
// peer manager tries to accept the peer again in this case.
Switched(ParaId),
// The declaration was accepted.
Accepted,
}
/// Keeps track of collator peer reputations and decides which peer connections are kept.
///
/// Generic over the reputation database `B` (see [`Backend`]).
pub struct PeerManager<B> {
// The reputation database.
db: B,
// State of the currently connected peers.
connected: ConnectedPeers,
}
impl<B: Backend> PeerManager<B> {
/// Initialize the peer manager (called on subsystem startup, after the node finished syncing to
/// the tip of the chain).
///
/// Besides creating the instance, this catches up the reputation DB with the blocks finalized
/// since the last block the DB processed. No decay is applied for this catch-up.
pub async fn startup<Sender: CollatorProtocolSenderTrait>(
    backend: B,
    sender: &mut Sender,
    scheduled_paras: BTreeSet<ParaId>,
) -> Result<Self> {
    let connected =
        ConnectedPeers::new(scheduled_paras, CONNECTED_PEERS_LIMIT, CONNECTED_PEERS_PARA_LIMIT);
    let mut peer_manager = Self { db: backend, connected };

    let (finalized_number, finalized_hash) = get_latest_finalized_block(sender).await?;
    let already_processed =
        peer_manager.db.processed_finalized_block_number().await.unwrap_or_default();

    let bumps = extract_reputation_bumps_on_new_finalized_block(
        sender,
        already_processed,
        (finalized_number, finalized_hash),
    )
    .await?;

    // The returned updates are dropped here; they are not applied to the connected peers state.
    peer_manager.db.process_bumps(finalized_number, bumps, None).await;

    Ok(peer_manager)
}
/// Handle a new block finality notification, by updating peer reputations.
///
/// The bumps found in the newly finalized blocks are applied to the DB together with the
/// `INACTIVITY_DECAY`, and every resulting change is forwarded to the connected peers state.
pub async fn update_reputations_on_new_finalized_block<Sender: CollatorProtocolSenderTrait>(
    &mut self,
    sender: &mut Sender,
    (finalized_block_hash, finalized_block_number): (Hash, BlockNumber),
) -> Result<()> {
    let already_processed = self.db.processed_finalized_block_number().await.unwrap_or_default();

    let bumps = extract_reputation_bumps_on_new_finalized_block(
        sender,
        already_processed,
        (finalized_block_number, finalized_block_hash),
    )
    .await?;

    let decay = Score::new(INACTIVITY_DECAY).expect("INACTIVITY_DECAY is a valid score");
    let updates = self.db.process_bumps(finalized_block_number, bumps, Some(decay)).await;

    // Keep the in-memory scores of the connected peers in sync with the DB.
    for update in updates {
        self.connected.update_reputation(update);
    }

    Ok(())
}
/// Process the registered paras and clean up all data pertaining to any unregistered paras, if
/// any. Should be called every N finalized block notifications, since it's expected that para
/// deregistrations are rare.
///
/// Only the reputation DB is touched by this call.
pub async fn registered_paras_update(&mut self, registered_paras: BTreeSet<ParaId>) {
// Tell the DB to clean up paras that are no longer registered. No need to clean up the
// connected peers state, since it will get automatically cleaned up as the claim queue
// gets rid of these stale assignments.
self.db.prune_paras(registered_paras).await;
}
/// Process a potential change of the scheduled paras.
///
/// If the set of scheduled paras is unchanged, this is a no-op. Otherwise the connected peers
/// state is rebuilt from scratch for the new schedule: every previously connected peer is
/// offered again to the new state, and all peers that end up rejected or replaced are
/// disconnected.
pub async fn scheduled_paras_update<Sender: CollatorProtocolSenderTrait>(
    &mut self,
    sender: &mut Sender,
    scheduled_paras: BTreeSet<ParaId>,
) {
    let prev_scheduled_paras: BTreeSet<_> = self.connected.scheduled_paras().copied().collect();

    if prev_scheduled_paras == scheduled_paras {
        // Nothing to do if the scheduled paras didn't change.
        return;
    }

    // Recreate the connected peers based on the new schedule and try populating it again based
    // on their reputations. Disconnect any peers that couldn't be kept.
    let prev_instance = std::mem::replace(
        &mut self.connected,
        ConnectedPeers::new(scheduled_paras, CONNECTED_PEERS_LIMIT, CONNECTED_PEERS_PARA_LIMIT),
    );
    let (prev_peers, cached_scores) = prev_instance.consume();

    // Build a closure that can be used to first query the in-memory past reputations of the
    // peers before reaching for the DB.
    // Borrow these for use in the closure.
    let cached_scores = &cached_scores;
    let db = &self.db;
    let reputation_query_fn = |peer_id: PeerId, para_id: ParaId| async move {
        if let Some(cached_score) =
            cached_scores.get(&para_id).and_then(|per_para| per_para.get_score(&peer_id))
        {
            cached_score
        } else {
            db.query(&peer_id, &para_id).await.unwrap_or_default()
        }
    };

    // See which of the old peers we should keep.
    let mut peers_to_disconnect = HashSet::new();
    for (peer_id, peer_info) in prev_peers {
        let outcome = self.connected.try_accept(reputation_query_fn, peer_id, peer_info).await;

        match outcome {
            TryAcceptOutcome::Rejected => {
                peers_to_disconnect.insert(peer_id);
            },
            TryAcceptOutcome::Replaced(replaced_peer_ids) => {
                peers_to_disconnect.extend(replaced_peer_ids);
            },
            TryAcceptOutcome::Added => {},
        }
    }

    // Disconnect peers that couldn't be kept.
    self.disconnect_peers(sender, peers_to_disconnect).await;
}
/// Process a declaration message of a peer.
///
/// Unknown (not connected) peers are ignored. A peer that switched paras is run through the
/// acceptance logic again, and a peer whose declaration is rejected gets disconnected.
pub async fn declared<Sender: CollatorProtocolSenderTrait>(
    &mut self,
    sender: &mut Sender,
    peer_id: PeerId,
    para_id: ParaId,
) {
    // Snapshot the peer's info before the declaration is recorded.
    let peer_info = match self.connected.peer_info(&peer_id) {
        Some(info) => info.clone(),
        None => return,
    };

    match self.connected.declared(peer_id, para_id) {
        DeclarationOutcome::Rejected => {
            gum::debug!(
                target: LOG_TARGET,
                ?para_id,
                ?peer_id,
                "Peer declared but rejected. Going to disconnect.",
            );

            let to_disconnect = [peer_id].into_iter().collect();
            self.disconnect_peers(sender, to_disconnect).await;
        },
        DeclarationOutcome::Switched(old_para_id) => {
            gum::debug!(
                target: LOG_TARGET,
                ?para_id,
                ?old_para_id,
                ?peer_id,
                "Peer switched collating paraid. Trying to accept it on the new one.",
            );

            self.try_accept_connection(sender, peer_id, peer_info).await;
        },
        DeclarationOutcome::Accepted => {
            gum::debug!(
                target: LOG_TARGET,
                ?para_id,
                ?peer_id,
                "Peer declared",
            );
        },
    }
}
/// Slash a peer's reputation for this paraid.
///
/// The slash is applied to the DB first and then mirrored into the connected peers state.
pub async fn slash_reputation(&mut self, peer_id: &PeerId, para_id: &ParaId, value: Score) {
    gum::debug!(
        target: LOG_TARGET,
        ?peer_id,
        ?para_id,
        ?value,
        "Slashing peer's reputation",
    );

    let update = ReputationUpdate {
        peer_id: *peer_id,
        para_id: *para_id,
        value,
        kind: ReputationUpdateKind::Slash,
    };

    self.db.slash(peer_id, para_id, value).await;
    self.connected.update_reputation(update);
}
/// Process a peer disconnected event coming from the network, by removing the peer from the
/// connected peers state. Stored reputations are not touched.
pub fn disconnected(&mut self, peer_id: &PeerId) {
self.connected.remove(peer_id);
}
/// A connection was made, triage it. Return whether or not it was kept.
///
/// Peers that lose their connection slots to the new peer are disconnected. If the new peer is
/// rejected, it is disconnected itself.
pub async fn try_accept_connection<Sender: CollatorProtocolSenderTrait>(
    &mut self,
    sender: &mut Sender,
    peer_id: PeerId,
    peer_info: PeerInfo,
) -> bool {
    let db = &self.db;
    // Go straight to the DB. We only store in-memory the reputations of connected peers.
    let query_db = |peer_id: PeerId, para_id: ParaId| async move {
        db.query(&peer_id, &para_id).await.unwrap_or_default()
    };

    let outcome = self.connected.try_accept(query_db, peer_id, peer_info).await;

    match outcome {
        TryAcceptOutcome::Rejected => {
            gum::debug!(
                target: LOG_TARGET,
                ?peer_id,
                "Peer connection was rejected",
            );

            let to_disconnect = [peer_id].into_iter().collect();
            self.disconnect_peers(sender, to_disconnect).await;
            false
        },
        TryAcceptOutcome::Replaced(other_peers) => {
            gum::trace!(
                target: LOG_TARGET,
                "Peer {:?} replaced the connection slots of other peers: {:?}",
                peer_id,
                &other_peers
            );

            self.disconnect_peers(sender, other_peers).await;
            true
        },
        TryAcceptOutcome::Added => true,
    }
}
/// Retrieve the score of the connected peer. We assume the peer is declared for this paraid.
///
/// The value comes from the connected peers state, not from the reputation DB.
pub fn connected_peer_score(&self, peer_id: &PeerId, para_id: &ParaId) -> Option<Score> {
self.connected.peer_score(peer_id, para_id)
}
/// Ask the network bridge to disconnect the given peers from the collation peer set.
///
/// The message is sent even if `peers` is empty.
async fn disconnect_peers<Sender: CollatorProtocolSenderTrait>(
    &self,
    sender: &mut Sender,
    peers: HashSet<PeerId>,
) {
    gum::trace!(
        target: LOG_TARGET,
        ?peers,
        "Disconnecting peers",
    );

    let message =
        NetworkBridgeTxMessage::DisconnectPeers(peers.into_iter().collect(), PeerSet::Collation);
    sender.send_message(message).await;
}
}
async fn get_ancestors<Sender: CollatorProtocolSenderTrait>(
sender: &mut Sender,
k: usize,
hash: Hash,
) -> Result<Vec<Hash>> {
let (tx, rx) = oneshot::channel();
sender
.send_message(ChainApiMessage::Ancestors { hash, k, response_channel: tx })
.await;
Ok(rx.await.map_err(|_| Error::CanceledAncestors)??)
}
async fn get_latest_finalized_block<Sender: CollatorProtocolSenderTrait>(
sender: &mut Sender,
) -> Result<(BlockNumber, Hash)> {
let (tx, rx) = oneshot::channel();
sender.send_message(ChainApiMessage::FinalizedBlockNumber(tx)).await;
let block_number = rx.await.map_err(|_| Error::CanceledFinalizedBlockNumber)??;
let (tx, rx) = oneshot::channel();
sender.send_message(ChainApiMessage::FinalizedBlockHash(block_number, tx)).await;
let block_hash = rx
.await
.map_err(|_| Error::CanceledFinalizedBlockHash)??
.ok_or_else(|| Error::FinalizedBlockNotFound(block_number))?;
Ok((block_number, block_hash))
}
/// Collect the reputation bumps earned in the finalized blocks that were not processed yet.
///
/// The blocks considered are the latest finalized block and its ancestors down to (but not
/// including) `processed_finalized_block_number`, limited to `MAX_STARTUP_ANCESTRY_LOOKBACK`
/// blocks. For every v2 candidate included in one of these blocks, the candidate's commitments
/// are looked up among the candidates pending availability at the parent block, and the peer
/// named by its `ApprovedPeer` UMP signal (if any) is credited with
/// `VALID_INCLUDED_CANDIDATE_BUMP` for the candidate's para.
///
/// Returns an empty map if there is nothing new to process.
async fn extract_reputation_bumps_on_new_finalized_block<Sender: CollatorProtocolSenderTrait>(
    sender: &mut Sender,
    processed_finalized_block_number: BlockNumber,
    (latest_finalized_block_number, latest_finalized_block_hash): (BlockNumber, Hash),
) -> Result<BTreeMap<ParaId, HashMap<PeerId, Score>>> {
    if latest_finalized_block_number < processed_finalized_block_number {
        // Shouldn't be possible, but in this case there is no other initialisation needed.
        gum::warn!(
            target: LOG_TARGET,
            latest_finalized_block_number,
            ?latest_finalized_block_hash,
            "Peer manager stored finalized block number {} is higher than the latest finalized block.",
            processed_finalized_block_number,
        );
        return Ok(BTreeMap::new());
    }

    let ancestry_len = std::cmp::min(
        latest_finalized_block_number.saturating_sub(processed_finalized_block_number),
        MAX_STARTUP_ANCESTRY_LOOKBACK,
    );

    if ancestry_len == 0 {
        return Ok(BTreeMap::new());
    }

    // Build the chain oldest-first and ending with the latest finalized block, so that each
    // element is the parent of the next one.
    // NOTE(review): this relies on `ChainApiMessage::Ancestors` answering in the order
    // `parent, grandparent, ...` — confirm against the chain API documentation.
    let mut ancestors =
        get_ancestors(sender, ancestry_len as usize, latest_finalized_block_hash).await?;
    ancestors.reverse();
    ancestors.push(latest_finalized_block_hash);

    gum::trace!(
        target: LOG_TARGET,
        ?latest_finalized_block_hash,
        processed_finalized_block_number,
        "Processing reputation bumps for finalized relay parent {} and its {} ancestors",
        latest_finalized_block_number,
        ancestry_len
    );

    // Included v2 candidates, keyed by the parent of the block that included them. The oldest
    // block of the chain is only ever used as a parent.
    let mut v2_candidates_per_rp: HashMap<Hash, BTreeMap<ParaId, HashSet<CandidateHash>>> =
        HashMap::with_capacity(ancestors.len());

    for pair in ancestors.windows(2) {
        let (parent_rp, rp) = (pair[0], pair[1]);

        let candidate_events = recv_runtime(request_candidate_events(rp, sender).await).await?;

        for event in candidate_events {
            if let CandidateEvent::CandidateIncluded(receipt, _, _, _) = event {
                // Only v2 receipts can contain UMP signals.
                if receipt.descriptor.version() == CandidateDescriptorVersion::V2 {
                    v2_candidates_per_rp
                        .entry(parent_rp)
                        .or_default()
                        .entry(receipt.descriptor.para_id())
                        .or_default()
                        .insert(receipt.hash());
                }
            }
        }
    }

    // This could be removed if we implemented https://github.com/pezkuwichain/pezkuwi-sdk/issues/152.
    let mut updates: BTreeMap<ParaId, HashMap<PeerId, Score>> = BTreeMap::new();
    for (rp, per_para) in v2_candidates_per_rp {
        for (para_id, included_candidates) in per_para {
            let candidates_pending_availability =
                recv_runtime(request_candidates_pending_availability(rp, para_id, sender).await)
                    .await?;

            for candidate in candidates_pending_availability {
                let candidate_hash = candidate.hash();
                // Only candidates that actually got included in the next block are rewarded.
                if !included_candidates.contains(&candidate_hash) {
                    continue;
                }

                let ump_signals = match candidate.commitments.ump_signals() {
                    Ok(ump_signals) => ump_signals,
                    Err(err) => {
                        // This should never happen, as the ump signals are checked during
                        // on-chain backing.
                        gum::warn!(
                            target: LOG_TARGET,
                            ?candidate_hash,
                            "Failed to parse UMP signals for included candidate: {}",
                            err
                        );
                        continue;
                    },
                };

                let Some(approved_peer) = ump_signals.approved_peer() else { continue };

                match PeerId::from_bytes(approved_peer) {
                    Ok(peer_id) => updates
                        .entry(para_id)
                        .or_default()
                        .entry(peer_id)
                        .or_default()
                        .saturating_add(VALID_INCLUDED_CANDIDATE_BUMP),
                    Err(err) => {
                        // Collator sent an invalid peerid. It's only harming
                        // itself.
                        gum::debug!(
                            target: LOG_TARGET,
                            ?candidate_hash,
                            "UMP signal contains invalid ApprovedPeer id: {}",
                            err
                        );
                    },
                }
            }
        }
    }

    Ok(updates)
}
@@ -0,0 +1,32 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use crate::validator_side_experimental::{peer_manager::Backend, Metrics, PeerManager};
use sp_keystore::KeystorePtr;
/// All state relevant for the validator side of the protocol lives here.
///
/// Generic over `B`, the type parameter of the wrapped [`PeerManager`].
pub struct State<B> {
    /// Peer manager, parameterized over `B`.
    peer_manager: PeerManager<B>,
    /// Keystore handle supplied at construction.
    keystore: KeystorePtr,
    /// Metrics handle supplied at construction.
    metrics: Metrics,
}
impl<B: Backend> State<B> {
/// Instantiate a new subsystem `State`.
pub fn new(peer_manager: PeerManager<B>, keystore: KeystorePtr, metrics: Metrics) -> Self {
Self { peer_manager, keystore, metrics }
}
}
@@ -0,0 +1,53 @@
[package]
name = "pezkuwi-dispute-distribution"
version = "7.0.0"
description = "Pezkuwi Dispute Distribution subsystem, which ensures all concerned validators are aware of a dispute and have the relevant votes."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
codec = { features = ["std"], workspace = true, default-features = true }
fatality = { workspace = true }
futures = { workspace = true }
futures-timer = { workspace = true }
gum = { workspace = true, default-features = true }
indexmap = { workspace = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
sp-application-crypto = { workspace = true, default-features = true }
sp-keystore = { workspace = true, default-features = true }
thiserror = { workspace = true }
[dev-dependencies]
assert_matches = { workspace = true }
async-channel = { workspace = true }
async-trait = { workspace = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
pezkuwi-primitives-test-helpers = { workspace = true }
sc-keystore = { workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
sp-tracing = { workspace = true, default-features = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives-test-helpers/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
@@ -0,0 +1,72 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Error handling related code and Error/Result definitions.
use pezkuwi_node_subsystem::SubsystemError;
use pezkuwi_node_subsystem_util::runtime;
use crate::{sender, LOG_TARGET};
use fatality::Nested;
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// Receiving subsystem message from overseer failed.
#[fatal]
#[error("Receiving message from overseer failed")]
SubsystemReceive(#[source] SubsystemError),
/// Spawning a running task failed.
#[fatal]
#[error("Spawning subsystem task failed")]
SpawnTask(#[source] SubsystemError),
/// `DisputeSender` mpsc receiver exhausted.
#[fatal]
#[error("Erasure chunk requester stream exhausted")]
SenderExhausted,
/// Errors coming from `runtime::Runtime`.
#[fatal(forward)]
#[error("Error while accessing runtime information")]
Runtime(#[from] runtime::Error),
/// Errors coming from `DisputeSender`
#[fatal(forward)]
#[error("Error while accessing runtime information")]
Sender(#[from] sender::Error),
}
/// Result type of this subsystem, using [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
/// Result type whose error type is `FatalError`.
pub type FatalResult<T> = std::result::Result<T, FatalError>;
/// Utility for eating top level errors and log them.
///
/// We basically always want to try and continue on error. This utility function is meant to
/// consume top-level errors by simply logging them
pub fn log_error(result: Result<()>, ctx: &'static str) -> std::result::Result<(), FatalError> {
match result.into_nested()? {
Err(jfyi) => {
gum::warn!(target: LOG_TARGET, error = ?jfyi, ctx);
Ok(())
},
Ok(()) => Ok(()),
}
}
@@ -0,0 +1,297 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! # Sending and receiving of `DisputeRequest`s.
//!
//! This subsystem essentially consists of two parts:
//!
//! - a sender
//! - and a receiver
//!
//! The sender is responsible for getting our vote out, see `sender`. The receiver handles
//! incoming [`DisputeRequest`](v1::DisputeRequest)s and offers spam protection, see `receiver`.
use std::time::Duration;
use futures::{channel::mpsc, FutureExt, StreamExt, TryFutureExt};
use pezkuwi_node_network_protocol::authority_discovery::AuthorityDiscovery;
use pezkuwi_node_subsystem_util::nesting_sender::NestingSender;
use sp_keystore::KeystorePtr;
use pezkuwi_node_network_protocol::request_response::{incoming::IncomingRequestReceiver, v1};
use pezkuwi_node_primitives::DISPUTE_WINDOW;
use pezkuwi_node_subsystem::{
messages::DisputeDistributionMessage, overseer, FromOrchestra, OverseerSignal,
SpawnedSubsystem, SubsystemError,
};
use pezkuwi_node_subsystem_util::{runtime, runtime::RuntimeInfo};
/// ## The sender [`DisputeSender`]
///
/// The sender (`DisputeSender`) keeps track of live disputes and makes sure our vote gets out for
/// each one of those. The sender is responsible for sending our vote to each validator
/// participating in the dispute and to each authority currently authoring blocks. The sending can
/// be initiated by sending `DisputeDistributionMessage::SendDispute` message to this subsystem.
///
/// In addition the `DisputeSender` will query the coordinator for active disputes on each
/// [`DisputeSender::update_leaves`] call and will initiate sending (start a `SendTask`) for every,
/// to this subsystem, unknown dispute. This is to make sure, we get our vote out, even on
/// restarts.
///
/// The actual work of sending and keeping track of transmission attempts to each validator for a
/// particular dispute are done by [`SendTask`]. The purpose of the `DisputeSender` is to keep
/// track of all ongoing disputes and start and clean up `SendTask`s accordingly.
mod sender;
use self::sender::{DisputeSender, DisputeSenderMessage};
/// ## The receiver [`DisputesReceiver`]
///
/// The receiving side is implemented as `DisputesReceiver` and is run as a separate long running
/// task within this subsystem ([`DisputesReceiver::run`]).
///
/// Conceptually, all the receiver has to do is wait for incoming requests, which are passed in via
/// a dedicated channel, and forward them to the dispute coordinator via
/// `DisputeCoordinatorMessage::ImportStatements`. Being the interface to the network and untrusted
/// nodes, the reality is not that simple of course. Before importing statements the receiver will
/// batch up imports as well as possible for efficient imports while maintaining timely dispute
/// resolution and handling of spamming validators:
///
/// - Drop all messages from non validator nodes, for this it requires the [`AuthorityDiscovery`]
/// service.
/// - Drop messages from a node, if it sends at a too high rate.
/// - Filter out duplicate messages (over some period of time).
/// - Drop any obviously invalid votes (invalid signatures for example).
/// - Ban peers whose votes were deemed invalid.
///
/// In general dispute-distribution works on limiting the work the dispute-coordinator will have to
/// do, while at the same time making it aware of new disputes as fast as possible.
///
/// For successfully imported votes, we will confirm the receipt of the message back to the sender.
/// This way a received confirmation guarantees, that the vote has been stored to disk by the
/// receiver.
mod receiver;
use self::receiver::DisputesReceiver;
/// Error and [`Result`] type for this subsystem.
mod error;
use error::{log_error, Error, FatalError, FatalResult, Result};
#[cfg(test)]
mod tests;
mod metrics;
/// Prometheus `Metrics` for dispute distribution.
pub use metrics::Metrics;
/// Log target used by this subsystem.
const LOG_TARGET: &str = "teyrchain::dispute-distribution";
/// Rate limit on the `receiver` side.
///
/// If messages from one peer come in at a higher rate than every `RECEIVE_RATE_LIMIT` on average,
/// we start dropping messages from that peer to enforce that limit.
pub const RECEIVE_RATE_LIMIT: Duration = Duration::from_millis(100);
/// Rate limit on the `sender` side.
///
/// In order to not hit the `RECEIVE_RATE_LIMIT` on the receiving side, we limit our sending rate as
/// well.
///
/// We add 50ms extra, just to have some safety margin to the `RECEIVE_RATE_LIMIT`.
pub const SEND_RATE_LIMIT: Duration = RECEIVE_RATE_LIMIT.saturating_add(Duration::from_millis(50));
/// The dispute distribution subsystem.
///
/// `AD` is the type of the authority discovery service; a clone of it is handed to the
/// `DisputesReceiver` when the subsystem is run.
pub struct DisputeDistributionSubsystem<AD> {
    /// Easy and efficient runtime access for this subsystem.
    runtime: RuntimeInfo,
    /// Sender for our dispute requests.
    disputes_sender: DisputeSender<DisputeSenderMessage>,
    /// Receive messages from `DisputeSender` background tasks.
    sender_rx: mpsc::Receiver<DisputeSenderMessage>,
    /// Receiver for incoming requests.
    ///
    /// Wrapped in an `Option` because `run` takes it out and passes it on to the
    /// `DisputesReceiver`.
    req_receiver: Option<IncomingRequestReceiver<v1::DisputeRequest>>,
    /// Authority discovery service.
    authority_discovery: AD,
    /// Metrics for this subsystem.
    metrics: Metrics,
}
#[overseer::subsystem(DisputeDistribution, error = SubsystemError, prefix = self::overseer)]
impl<Context, AD> DisputeDistributionSubsystem<AD>
where
    <Context as overseer::DisputeDistributionContextTrait>::Sender:
        overseer::DisputeDistributionSenderTrait + Sync + Send,
    AD: AuthorityDiscovery + Clone,
{
    /// Wrap the subsystem's main loop (`run`) into a `SpawnedSubsystem`.
    ///
    /// Any error returned by `run` is converted into a `SubsystemError` with the origin
    /// "dispute-distribution".
    fn start(self, ctx: Context) -> SpawnedSubsystem {
        let main_loop = self.run(ctx);
        SpawnedSubsystem {
            name: "dispute-distribution-subsystem",
            future: main_loop
                .map_err(|err| SubsystemError::with_origin("dispute-distribution", err))
                .boxed(),
        }
    }
}
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
impl<AD> DisputeDistributionSubsystem<AD>
where
AD: AuthorityDiscovery + Clone,
{
/// Create a new instance of the dispute distribution.
pub fn new(
keystore: KeystorePtr,
req_receiver: IncomingRequestReceiver<v1::DisputeRequest>,
authority_discovery: AD,
metrics: Metrics,
) -> Self {
let runtime = RuntimeInfo::new_with_config(runtime::Config {
keystore: Some(keystore),
session_cache_lru_size: DISPUTE_WINDOW.get(),
});
let (tx, sender_rx) = NestingSender::new_root(1);
let disputes_sender = DisputeSender::new(tx, metrics.clone());
Self {
runtime,
disputes_sender,
sender_rx,
req_receiver: Some(req_receiver),
authority_discovery,
metrics,
}
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(mut self, mut ctx: Context) -> std::result::Result<(), FatalError> {
let receiver = DisputesReceiver::new(
ctx.sender().clone(),
self.req_receiver
.take()
.expect("Must be provided on `new` and we take ownership here. qed."),
self.authority_discovery.clone(),
self.metrics.clone(),
);
ctx.spawn("disputes-receiver", receiver.run().boxed())
.map_err(FatalError::SpawnTask)?;
// Process messages for sending side.
//
// Note: We want the sender to be rate limited and we are currently taking advantage of the
// fact that the root task of this subsystem is only concerned with sending: Functions of
// `DisputeSender` might back pressure if the rate limit is hit, which will slow down this
// loop. If this fact ever changes, we will likely need another task.
loop {
let message = MuxedMessage::receive(&mut ctx, &mut self.sender_rx).await;
match message {
MuxedMessage::Subsystem(result) => {
let result = match result? {
FromOrchestra::Signal(signal) => {
match self.handle_signals(&mut ctx, signal).await {
Ok(SignalResult::Conclude) => return Ok(()),
Ok(SignalResult::Continue) => Ok(()),
Err(f) => Err(f),
}
},
FromOrchestra::Communication { msg } =>
self.handle_subsystem_message(&mut ctx, msg).await,
};
log_error(result, "on FromOrchestra")?;
},
MuxedMessage::Sender(result) => {
let result = self
.disputes_sender
.on_message(
&mut ctx,
&mut self.runtime,
result.ok_or(FatalError::SenderExhausted)?,
)
.await
.map_err(Error::Sender);
log_error(result, "on_message")?;
},
}
}
}
/// Handle overseer signals.
async fn handle_signals<Context>(
&mut self,
ctx: &mut Context,
signal: OverseerSignal,
) -> Result<SignalResult> {
match signal {
OverseerSignal::Conclude => return Ok(SignalResult::Conclude),
OverseerSignal::ActiveLeaves(update) => {
self.disputes_sender.update_leaves(ctx, &mut self.runtime, update).await?;
},
OverseerSignal::BlockFinalized(_, _) => {},
};
Ok(SignalResult::Continue)
}
/// Handle `DisputeDistributionMessage`s.
async fn handle_subsystem_message<Context>(
&mut self,
ctx: &mut Context,
msg: DisputeDistributionMessage,
) -> Result<()> {
match msg {
DisputeDistributionMessage::SendDispute(dispute_msg) =>
self.disputes_sender.start_sender(ctx, &mut self.runtime, dispute_msg).await?,
}
Ok(())
}
}
/// Messages to be handled in this subsystem.
///
/// Produced by `MuxedMessage::receive`, which multiplexes the overseer channel and the channel
/// fed by the sender background tasks.
#[derive(Debug)]
enum MuxedMessage {
    /// Messages from other subsystems.
    ///
    /// A failure to receive from the overseer is carried as `FatalError::SubsystemReceive`.
    Subsystem(FatalResult<FromOrchestra<DisputeDistributionMessage>>),
    /// Messages from spawned sender background tasks.
    ///
    /// `None` means the stream of sender messages has ended.
    Sender(Option<DisputeSenderMessage>),
}
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
impl MuxedMessage {
    /// Wait for the next message from either the sender background tasks or the overseer.
    ///
    /// The selection is biased: `from_sender` is listed first, so a sender message wins whenever
    /// both sources are ready.
    ///
    /// Returns `MuxedMessage::Sender` with whatever `from_sender.next()` yielded, or
    /// `MuxedMessage::Subsystem` with the result of `ctx.recv()`, its error mapped to
    /// `FatalError::SubsystemReceive`.
    async fn receive<Context>(
        ctx: &mut Context,
        from_sender: &mut mpsc::Receiver<DisputeSenderMessage>,
    ) -> Self {
        // We are only fusing here to make `select` happy, in reality we will quit if the stream
        // ends.
        let from_overseer = ctx.recv().fuse();
        futures::pin_mut!(from_overseer, from_sender);
        // We select biased to make sure we finish up loose ends, before starting new work.
        futures::select_biased!(
            msg = from_sender.next() => MuxedMessage::Sender(msg),
            msg = from_overseer => MuxedMessage::Subsystem(msg.map_err(FatalError::SubsystemReceive)),
        )
    }
}
/// Result of handling signal from overseer.
///
/// Returned by `handle_signals` to tell the main loop whether to keep running.
enum SignalResult {
    /// Overseer asked us to conclude.
    Conclude,
    /// We can continue processing events.
    Continue,
}
@@ -0,0 +1,130 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::{
metrics,
metrics::{
prometheus,
prometheus::{Counter, CounterVec, Opts, PrometheusError, Registry, U64},
},
};
/// Label for success counters.
pub const SUCCEEDED: &str = "succeeded";
/// Label for fail counters.
pub const FAILED: &str = "failed";
/// Dispute Distribution metrics.
///
/// Wraps an optional `MetricsInner`. When it is `None` (as with `new_dummy()` or `Default`),
/// the recording methods do nothing.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
/// The actual Prometheus collectors behind `Metrics`.
#[derive(Clone)]
struct MetricsInner {
    /// Number of sent dispute requests (succeeded and failed).
    sent_requests: CounterVec<U64>,
    /// Number of requests received.
    ///
    /// This is all requests coming in, regardless of whether they are processed or dropped.
    received_requests: Counter<U64>,
    /// Number of requests for which `ImportStatements` returned.
    ///
    /// We both have successful imports and failed imports here.
    imported_requests: CounterVec<U64>,
    /// The duration of issued dispute request to response.
    time_dispute_request: prometheus::Histogram,
}
impl Metrics {
/// Create new dummy metrics, not reporting anything.
pub fn new_dummy() -> Self {
Metrics(None)
}
/// Increment counter on finished request sending.
pub fn on_sent_request(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.sent_requests.with_label_values(&[label]).inc()
}
}
/// Increment counter on served disputes.
pub fn on_received_request(&self) {
if let Some(metrics) = &self.0 {
metrics.received_requests.inc()
}
}
/// Statements have been imported.
pub fn on_imported(&self, label: &'static str, num_requests: usize) {
if let Some(metrics) = &self.0 {
metrics
.imported_requests
.with_label_values(&[label])
.inc_by(num_requests as u64)
}
}
/// Get a timer to time request/response duration.
pub fn time_dispute_request(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.time_dispute_request.start_timer())
}
}
impl metrics::Metrics for Metrics {
    /// Create all collectors and register them with `registry`.
    ///
    /// Collectors are registered in the order: sent requests, received requests, imported
    /// requests, request timing. The first failure is returned as a `PrometheusError`.
    fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
        let sent_requests = prometheus::register(
            CounterVec::new(
                Opts::new(
                    "pezkuwi_teyrchain_dispute_distribution_sent_requests",
                    "Total number of sent requests.",
                ),
                &["success"],
            )?,
            registry,
        )?;
        let received_requests = prometheus::register(
            Counter::new(
                "pezkuwi_teyrchain_dispute_distribution_received_requests",
                "Total number of received dispute requests.",
            )?,
            registry,
        )?;
        let imported_requests = prometheus::register(
            CounterVec::new(
                Opts::new(
                    "pezkuwi_teyrchain_dispute_distribution_imported_requests",
                    "Total number of imported requests.",
                ),
                &["success"],
            )?,
            registry,
        )?;
        let time_dispute_request = prometheus::register(
            prometheus::Histogram::with_opts(prometheus::HistogramOpts::new(
                "pezkuwi_teyrchain_dispute_distribution_time_dispute_request",
                "Time needed for dispute votes to get confirmed/fail getting transmitted.",
            ))?,
            registry,
        )?;

        Ok(Metrics(Some(MetricsInner {
            sent_requests,
            received_requests,
            imported_requests,
            time_dispute_request,
        })))
    }
}
@@ -0,0 +1,209 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{collections::HashMap, time::Instant};
use gum::CandidateHash;
use pezkuwi_node_network_protocol::{
request_response::{incoming::OutgoingResponseSender, v1::DisputeRequest},
PeerId,
};
use pezkuwi_node_primitives::SignedDisputeStatement;
use pezkuwi_primitives::{CandidateReceiptV2 as CandidateReceipt, ValidatorIndex};
use crate::receiver::{BATCH_COLLECTING_INTERVAL, MIN_KEEP_BATCH_ALIVE_VOTES};
use super::MAX_BATCH_LIFETIME;
/// A batch of votes to be imported into the `dispute-coordinator`.
///
/// Vote imports are way more efficient when performed in batches, hence we batch together incoming
/// votes until the rate of incoming votes falls below a threshold, then we import into the dispute
/// coordinator.
///
/// A `Batch` keeps track of the votes to be imported and the current incoming rate, on rate update
/// it will "flush" in case the incoming rate dropped too low, preparing the import.
pub struct Batch {
    /// The actual candidate this batch is concerned with.
    candidate_receipt: CandidateReceipt,
    /// Cache of `CandidateHash` (candidate_receipt.hash()).
    candidate_hash: CandidateHash,
    /// All valid votes received in this batch so far.
    ///
    /// We differentiate between valid and invalid votes, so we can detect (and drop) duplicates,
    /// while still allowing validators to equivocate.
    ///
    /// Detecting and rejecting duplicates is crucial in order to effectively enforce
    /// `MIN_KEEP_BATCH_ALIVE_VOTES` per `BATCH_COLLECTING_INTERVAL`. If we would count duplicates
    /// here, the mechanism would be broken.
    valid_votes: HashMap<ValidatorIndex, SignedDisputeStatement>,
    /// All invalid votes received in this batch so far.
    invalid_votes: HashMap<ValidatorIndex, SignedDisputeStatement>,
    /// How many votes have been batched since the last tick/creation.
    ///
    /// Only votes that were not yet present in the batch are counted.
    votes_batched_since_last_tick: u32,
    /// Expiry time for the batch.
    ///
    /// The batch gets flushed at this point in time at the latest.
    best_before: Instant,
    /// Requesters waiting for a response.
    requesters: Vec<(PeerId, OutgoingResponseSender<DisputeRequest>)>,
}
/// Result of checking a batch every `BATCH_COLLECTING_INTERVAL`.
///
/// Returned by `Batch::tick`, which consumes the batch: the batch is either handed back
/// (`Alive`) or converted into a `PreparedImport` (`Done`).
pub(super) enum TickResult {
    /// Batch is still alive, please call `tick` again at the given `Instant`.
    Alive(Batch, Instant),
    /// Batch is done, ready for import!
    Done(PreparedImport),
}
/// Ready for import.
///
/// Built from a finished `Batch` via `From<Batch>`.
pub struct PreparedImport {
    /// The candidate the statements are about.
    pub candidate_receipt: CandidateReceipt,
    /// All batched votes (valid and invalid), each with the index of the voting validator.
    pub statements: Vec<(SignedDisputeStatement, ValidatorIndex)>,
    /// Information about original requesters.
    pub requesters: Vec<(PeerId, OutgoingResponseSender<DisputeRequest>)>,
}
impl From<Batch> for PreparedImport {
fn from(batch: Batch) -> Self {
let Batch {
candidate_receipt,
valid_votes,
invalid_votes,
requesters: pending_responses,
..
} = batch;
let statements = valid_votes
.into_iter()
.chain(invalid_votes.into_iter())
.map(|(index, statement)| (statement, index))
.collect();
Self { candidate_receipt, statements, requesters: pending_responses }
}
}
impl Batch {
    /// Create a new empty batch based on the given `CandidateReceipt`.
    ///
    /// To create a `Batch` use `Batches::find_batch`.
    ///
    /// Arguments:
    ///
    /// * `candidate_receipt` - The candidate this batch is meant to track votes for.
    /// * `now` - current time stamp; both the first tick and the expiry time of the batch are
    ///   calculated from it.
    ///
    /// Returns: A batch and the first `Instant` you are supposed to call `tick`.
    pub(super) fn new(candidate_receipt: CandidateReceipt, now: Instant) -> (Self, Instant) {
        let s = Self {
            candidate_hash: candidate_receipt.hash(),
            candidate_receipt,
            valid_votes: HashMap::new(),
            invalid_votes: HashMap::new(),
            votes_batched_since_last_tick: 0,
            // Use the caller supplied `now` instead of reading the clock a second time, so the
            // expiry and the first tick are based on the same point in time.
            best_before: now + MAX_BATCH_LIFETIME,
            requesters: Vec::new(),
        };
        let next_tick = s.calculate_next_tick(now);
        (s, next_tick)
    }

    /// Receipt of the candidate this batch is batching votes for.
    pub fn candidate_receipt(&self) -> &CandidateReceipt {
        &self.candidate_receipt
    }

    /// Add votes from a validator into the batch.
    ///
    /// The statements are supposed to be the valid and invalid statements received in a
    /// `DisputeRequest`.
    ///
    /// The given `pending_response` is the corresponding response sender for responding to `peer`.
    /// If at least one of the votes is new as far as this batch is concerned we record the
    /// pending_response, for later use. In case both votes are known already, we return the
    /// response sender as an `Err` value.
    pub fn add_votes(
        &mut self,
        valid_vote: (SignedDisputeStatement, ValidatorIndex),
        invalid_vote: (SignedDisputeStatement, ValidatorIndex),
        peer: PeerId,
        pending_response: OutgoingResponseSender<DisputeRequest>,
    ) -> Result<(), OutgoingResponseSender<DisputeRequest>> {
        debug_assert!(valid_vote.0.candidate_hash() == invalid_vote.0.candidate_hash());
        debug_assert!(valid_vote.0.candidate_hash() == &self.candidate_hash);

        // Only votes from validators we have not seen yet count towards keeping the batch alive.
        let mut duplicate = true;
        if self.valid_votes.insert(valid_vote.1, valid_vote.0).is_none() {
            self.votes_batched_since_last_tick += 1;
            duplicate = false;
        }
        if self.invalid_votes.insert(invalid_vote.1, invalid_vote.0).is_none() {
            self.votes_batched_since_last_tick += 1;
            duplicate = false;
        }

        if duplicate {
            Err(pending_response)
        } else {
            self.requesters.push((peer, pending_response));
            Ok(())
        }
    }

    /// Check batch for liveness.
    ///
    /// This function is supposed to be called at instants given at construction and as returned as
    /// part of `TickResult`.
    ///
    /// The batch stays alive if at least `MIN_KEEP_BATCH_ALIVE_VOTES` new votes arrived since the
    /// last tick and `best_before` has not been reached yet; otherwise it is converted into a
    /// `PreparedImport`.
    pub(super) fn tick(mut self, now: Instant) -> TickResult {
        if self.votes_batched_since_last_tick >= MIN_KEEP_BATCH_ALIVE_VOTES &&
            now < self.best_before
        {
            // Still good:
            let next_tick = self.calculate_next_tick(now);
            // Reset counter:
            self.votes_batched_since_last_tick = 0;
            TickResult::Alive(self, next_tick)
        } else {
            TickResult::Done(PreparedImport::from(self))
        }
    }

    /// Calculate when the next tick should happen.
    ///
    /// This will usually return `now + BATCH_COLLECTING_INTERVAL`, except if the lifetime of this
    /// batch would exceed `MAX_BATCH_LIFETIME`.
    ///
    /// # Arguments
    ///
    /// * `now` - The current time.
    fn calculate_next_tick(&self, now: Instant) -> Instant {
        // Never schedule a tick past the expiry of the batch.
        (now + BATCH_COLLECTING_INTERVAL).min(self.best_before)
    }
}
@@ -0,0 +1,170 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
collections::{hash_map, HashMap},
time::{Duration, Instant},
};
use futures::future::pending;
use pezkuwi_node_network_protocol::request_response::DISPUTE_REQUEST_TIMEOUT;
use pezkuwi_primitives::{CandidateHash, CandidateReceiptV2 as CandidateReceipt};
use crate::{
receiver::batches::{batch::TickResult, waiting_queue::PendingWake},
LOG_TARGET,
};
pub use self::batch::{Batch, PreparedImport};
use self::waiting_queue::WaitingQueue;
use super::{
error::{JfyiError, JfyiResult},
BATCH_COLLECTING_INTERVAL,
};
/// A single batch (per candidate) as managed by `Batches`.
mod batch;
/// Queue events in time and wait for them to become ready.
mod waiting_queue;
/// Safe-guard in case votes trickle in real slow.
///
/// If the batch life time exceeded the time the sender is willing to wait for a confirmation, we
/// would trigger pointless re-sends.
///
/// Set to `DISPUTE_REQUEST_TIMEOUT` minus two seconds (saturating at zero).
const MAX_BATCH_LIFETIME: Duration = DISPUTE_REQUEST_TIMEOUT.saturating_sub(Duration::from_secs(2));
/// Limit the number of batches that can be alive at any given time.
///
/// `Batches::find_batch` fails with `JfyiError::MaxBatchLimitReached` once this many batches
/// exist.
///
/// Reasoning for this number, see guide.
pub const MAX_BATCHES: usize = 1000;
/// Manage batches.
///
/// - Batches can be found via `find_batch()` in order to add votes to them/check they exist.
/// - Batches can be checked for being ready for flushing in order to import contained votes.
pub struct Batches {
    /// The batches we manage, keyed by the hash of the candidate they collect votes for.
    ///
    /// Kept invariants:
    /// For each entry in `batches`, there exists an entry in `waiting_queue` as well - we wait on
    /// all batches!
    batches: HashMap<CandidateHash, Batch>,
    /// Waiting queue for waiting for batches to become ready for `tick`.
    ///
    /// Kept invariants by `Batches`:
    /// For each entry in the `waiting_queue` there exists a corresponding entry in `batches`.
    waiting_queue: WaitingQueue<CandidateHash>,
}
/// A found batch is either really found or got created so it can be found.
///
/// Both variants hold a mutable reference to the batch stored in `Batches`.
pub enum FoundBatch<'a> {
    /// Batch just got created.
    Created(&'a mut Batch),
    /// Batch already existed.
    Found(&'a mut Batch),
}
impl Batches {
    /// Create new empty `Batches`.
    pub fn new() -> Self {
        debug_assert!(
            MAX_BATCH_LIFETIME > BATCH_COLLECTING_INTERVAL,
            "Unexpectedly low `MAX_BATCH_LIFETIME`, please check parameters."
        );
        Self { batches: HashMap::new(), waiting_queue: WaitingQueue::new() }
    }

    /// Find a particular batch.
    ///
    /// That is either find it, or we create it as reflected by the result `FoundBatch`. A newly
    /// created batch is also registered in the waiting queue for its first tick.
    ///
    /// # Errors
    ///
    /// Returns `JfyiError::MaxBatchLimitReached` if `MAX_BATCHES` batches are already alive.
    pub fn find_batch(
        &mut self,
        candidate_hash: CandidateHash,
        candidate_receipt: CandidateReceipt,
    ) -> JfyiResult<FoundBatch<'_>> {
        // NOTE(review): this check also rejects lookups of batches that already exist once the
        // limit is reached - confirm this is intended.
        if self.batches.len() >= MAX_BATCHES {
            return Err(JfyiError::MaxBatchLimitReached);
        }
        debug_assert!(candidate_hash == candidate_receipt.hash());
        let result = match self.batches.entry(candidate_hash) {
            hash_map::Entry::Vacant(vacant) => {
                let now = Instant::now();
                let (created, ready_at) = Batch::new(candidate_receipt, now);
                let pending_wake = PendingWake { payload: candidate_hash, ready_at };
                self.waiting_queue.push(pending_wake);
                FoundBatch::Created(vacant.insert(created))
            },
            hash_map::Entry::Occupied(occupied) => FoundBatch::Found(occupied.into_mut()),
        };
        Ok(result)
    }

    /// Wait for the next `tick` to check for ready batches.
    ///
    /// This function blocks (returns `Poll::Pending`) until at least one batch can be
    /// checked for readiness meaning that `BATCH_COLLECTING_INTERVAL` has passed since the last
    /// check for that batch or it reached end of life.
    ///
    /// If this `Batches` instance is empty (does not actually contain any batches), then this
    /// function will always return `Poll::Pending`.
    ///
    /// Returns: A `Vec` of all `PreparedImport`s from batches that became ready.
    pub async fn check_batches(&mut self) -> Vec<PreparedImport> {
        let mut imports = Vec::new();

        // Wait for at least one batch to become ready:
        self.waiting_queue.wait_ready(Instant::now()).await;

        // The wait above may have suspended us for a while, so take a fresh time stamp. A time
        // stamp from before the wait would make entries that became ready in the meantime look
        // as if they were not ready yet.
        let now = Instant::now();

        // Process all ready entries:
        while let Some(wake) = self.waiting_queue.pop_ready(now) {
            let batch = self.batches.remove(&wake.payload);
            debug_assert!(
                batch.is_some(),
                "Entries referenced in `waiting_queue` are supposed to exist!"
            );
            let batch = match batch {
                // Broken invariant: in builds without debug assertions we never resolve.
                None => return pending().await,
                Some(batch) => batch,
            };
            match batch.tick(now) {
                TickResult::Done(import) => {
                    gum::trace!(
                        target: LOG_TARGET,
                        candidate_hash = ?wake.payload,
                        "Batch became ready."
                    );
                    imports.push(import);
                },
                TickResult::Alive(old_batch, next_tick) => {
                    gum::trace!(
                        target: LOG_TARGET,
                        candidate_hash = ?wake.payload,
                        "Batch found to be still alive on check."
                    );
                    // Keep the invariant: every batch has an entry in the waiting queue.
                    let pending_wake = PendingWake { payload: wake.payload, ready_at: next_tick };
                    self.waiting_queue.push(pending_wake);
                    self.batches.insert(wake.payload, old_batch);
                },
            }
        }
        imports
    }
}
@@ -0,0 +1,204 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{cmp::Ordering, collections::BinaryHeap, time::Instant};
use futures::future::pending;
use futures_timer::Delay;
/// Wait asynchronously for given `Instant`s one after the other.
///
/// `PendingWake`s can be inserted via `push()`; `wait_ready()` always waits for the earliest
/// `Instant` currently in the queue and `pop_ready()` hands out entries that are due.
pub struct WaitingQueue<Payload> {
/// All pending wakes we are supposed to wait on in order.
///
/// `PendingWake`'s `Ord` instance is reversed, so the top of this heap is the entry with the
/// earliest `ready_at`.
pending_wakes: BinaryHeap<PendingWake<Payload>>,
/// Wait for next `PendingWake`.
///
/// Set by `wait_ready()` and reset to `None` on every `push()`, as a newly pushed entry
/// might be due earlier than what the timer was set up for.
timer: Option<Delay>,
}
/// Represents some event waiting to be processed at `ready_at`.
///
/// This is an event in `WaitingQueue`. It provides an `Ord` instance, that sorts descending with
/// regard to `Instant` (so we get a `min-heap` with the earliest `Instant` at the top). Entries
/// with equal `ready_at` are ordered by `payload`, again reversed.
#[derive(Eq, PartialEq)]
pub struct PendingWake<Payload> {
/// User data that is handed back by `WaitingQueue::pop_ready` once the entry is due.
pub payload: Payload,
/// Point in time from which on this entry is considered ready.
pub ready_at: Instant,
}
impl<Payload: Eq + Ord> WaitingQueue<Payload> {
/// Create a `WaitingQueue` without any entries and without an armed timer.
///
/// As long as nothing got pushed, `pop_ready` returns `None` and `wait_ready` stays
/// `Poll::Pending`.
pub fn new() -> Self {
    let pending_wakes = BinaryHeap::new();
    Self { pending_wakes, timer: None }
}
/// Enqueue a `PendingWake`.
///
/// Any timer that is currently set up gets discarded, so the next call to `wait_ready`
/// re-computes its delay from the (possibly new) earliest entry.
pub fn push(&mut self, wake: PendingWake<Payload>) {
    // The new entry might be due earlier than what the old timer was waiting for:
    drop(self.timer.take());
    self.pending_wakes.push(wake);
}
/// Remove and return the earliest entry, provided it is due at `now`.
///
/// This never waits: if the queue is empty or its earliest entry has a `ready_at` later than
/// `now`, `None` is returned.
pub fn pop_ready(&mut self, now: Instant) -> Option<PendingWake<Payload>> {
    let head_is_due = matches!(self.pending_wakes.peek(), Some(head) if head.ready_at <= now);
    head_is_due.then(|| self.pending_wakes.pop().expect("We just peeked. qed."))
}
/// Wait (without popping) until the earliest entry is due.
///
/// Once the returned future resolves, `pop_ready()` is expected to return `Some` when given
/// an up to date time stamp.
///
/// `now` should be the current time as returned by `Instant::now()`. It is used both for
/// deciding whether the earliest entry is already due and for computing the delay of a
/// newly armed timer.
///
/// A timer left over from a previous call is awaited first. If the queue is empty, the
/// timer is cleared and the future never resolves (`Poll::Pending` without scheduling a
/// wake).
pub async fn wait_ready(&mut self, now: Instant) {
    if let Some(previous) = self.timer.as_mut() {
        // Previous timer was not done yet.
        previous.await;
    }

    let remaining = match self.pending_wakes.peek() {
        Some(head) if head.ready_at <= now => return,
        Some(head) => Some(head.ready_at.duration_since(now)),
        None => None,
    };

    self.timer = remaining.map(Delay::new);
    match self.timer.as_mut() {
        Some(timer) => timer.await,
        None => pending().await,
    }
}
}
/// Partial ordering of `PendingWake`s, fully defined by the `Ord` instance.
impl<Payload: Eq + Ord> PartialOrd<PendingWake<Payload>> for PendingWake<Payload> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
// Delegate to `Ord`, so both orderings always agree.
Some(self.cmp(other))
}
}
/// Reversed ordering: the wake with the earliest `ready_at` compares as the greatest one, so a
/// max-heap like `BinaryHeap` yields it first. Ties are broken by the (also reversed) payload.
impl<Payload: Ord> Ord for PendingWake<Payload> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Operands are swapped on purpose (min-heap behaviour):
        other.ready_at.cmp(&self.ready_at).then_with(|| other.payload.cmp(&self.payload))
    }
}
#[cfg(test)]
mod tests {
use std::{
task::Poll,
time::{Duration, Instant},
};
use assert_matches::assert_matches;
use futures::{future::poll_fn, pin_mut, Future};
use crate::LOG_TARGET;
use super::{PendingWake, WaitingQueue};
/// Entries pushed in arbitrary order must become ready strictly by `ready_at` (ties broken by
/// payload), `wait_ready` must not return before the corresponding instant has passed, and an
/// empty queue must stay pending.
#[test]
fn wait_ready_waits_for_earliest_event_always() {
sp_tracing::try_init_simple();
let mut queue = WaitingQueue::new();
let now = Instant::now();
// All expected wake-up times below are expressed relative to `start`.
let start = now;
queue.push(PendingWake { payload: 1u32, ready_at: now + Duration::from_millis(3) });
// Push another one in order:
queue.push(PendingWake { payload: 2u32, ready_at: now + Duration::from_millis(5) });
// Push one out of order:
queue.push(PendingWake { payload: 0u32, ready_at: now + Duration::from_millis(1) });
// Push another one at same timestamp (should become ready at the same time)
queue.push(PendingWake { payload: 10u32, ready_at: now + Duration::from_millis(1) });
futures::executor::block_on(async move {
// No time passed yet - nothing should be ready.
assert!(queue.pop_ready(now).is_none(), "No time has passed, nothing should be ready");
// Receive them in order at expected times:
queue.wait_ready(now).await;
gum::trace!(target: LOG_TARGET, "After first wait.");
// `now` is advanced synthetically to the expected wake-up time; the assertion on
// `Instant::now()` checks that at least that much real time has actually passed.
let now = start + Duration::from_millis(1);
assert!(Instant::now() - start >= Duration::from_millis(1));
assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(0u32));
// One more should be ready:
assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(10u32));
assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready.");
queue.wait_ready(now).await;
gum::trace!(target: LOG_TARGET, "After second wait.");
let now = start + Duration::from_millis(3);
assert!(Instant::now() - start >= Duration::from_millis(3));
assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(1u32));
assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready.");
// Push in between wait:
// Poll `wait_ready` exactly once (arming the timer for the 5ms entry) and drop the future.
poll_fn(|cx| {
let fut = queue.wait_ready(now);
pin_mut!(fut);
assert_matches!(fut.poll(cx), Poll::Pending);
Poll::Ready(())
})
.await;
// This entry is due before the one the timer above was armed for:
queue.push(PendingWake { payload: 3u32, ready_at: start + Duration::from_millis(4) });
queue.wait_ready(now).await;
// Newly pushed element should have become ready:
gum::trace!(target: LOG_TARGET, "After third wait.");
let now = start + Duration::from_millis(4);
assert!(Instant::now() - start >= Duration::from_millis(4));
assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(3u32));
assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready.");
queue.wait_ready(now).await;
gum::trace!(target: LOG_TARGET, "After fourth wait.");
let now = start + Duration::from_millis(5);
assert!(Instant::now() - start >= Duration::from_millis(5));
assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(2u32));
assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready.");
// queue empty - should wait forever now:
poll_fn(|cx| {
let fut = queue.wait_ready(now);
pin_mut!(fut);
assert_matches!(fut.poll(cx), Poll::Pending);
Poll::Ready(())
})
.await;
});
}
}
@@ -0,0 +1,97 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Error handling related code and Error/Result definitions.
use fatality::Nested;
use gum::CandidateHash;
use pezkuwi_node_network_protocol::{request_response::incoming, PeerId};
use pezkuwi_node_subsystem_util::runtime;
use pezkuwi_primitives::AuthorityDiscoveryId;
use crate::LOG_TARGET;
/// Errors of the dispute request receiver.
///
/// Variants marked `#[fatal]` end up on the fatal side when the error gets split, everything
/// else is merely logged by `log_error`.
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// Accessing runtime information failed (fatality is forwarded from the wrapped error).
#[fatal(forward)]
#[error("Error while accessing runtime information")]
Runtime(#[from] runtime::Error),
/// Receiving the next incoming request failed (fatality is forwarded from the wrapped error).
#[fatal(forward)]
#[error("Retrieving next incoming request failed.")]
IncomingRequest(#[from] incoming::Error),
/// Sending the response to the listed peers failed.
#[error("Sending back response to peers {0:#?} failed.")]
SendResponses(Vec<PeerId>),
/// Sending the response carrying the reputation change for an invalid signature failed.
#[error("Changing peer's ({0}) reputation failed.")]
SetPeerReputation(PeerId),
/// The votes of a request could not be converted into signed votes.
#[error("Dispute request with invalid signatures, from peer {0}.")]
InvalidSignature(PeerId),
/// A batch rejected the votes of a request as redundant.
#[error("Received votes from peer {0} have been completely redundant.")]
RedundantMessage(PeerId),
/// The confirmation channel of an import got dropped without an answer.
#[error("Import of dispute got canceled for candidate {0} - import failed for some reason.")]
ImportCanceled(CandidateHash),
/// No authority id could be found for the sending peer.
#[error("Peer {0} attempted to participate in dispute and is not a validator.")]
NotAValidator(PeerId),
/// NOTE(review): not constructed anywhere in the visible code - presumably raised when a
/// batch is force-flushed; confirm against the batch implementation.
#[error("Force flush for batch that could not be found attempted, candidate hash: {0}")]
ForceFlushBatchDoesNotExist(CandidateHash),
// Should never happen in practice:
/// A new batch would have been needed, but the limit of concurrent batches was reached.
#[error("We needed to drop messages, because we reached limit on concurrent batches.")]
MaxBatchLimitReached,
/// The per-authority request queue was full when another request arrived.
#[error("Authority {0} sent messages at a too high rate.")]
AuthorityFlooding(AuthorityDiscoveryId),
}
/// Result type that can carry any receiver `Error`, fatal or not.
pub type Result<T> = std::result::Result<T, Error>;
/// Result type restricted to `JfyiError` (presumably the non-fatal half generated by the
/// `fatality(splitable)` attribute on `Error` - confirm).
pub type JfyiResult<T> = std::result::Result<T, JfyiError>;
/// Utility for eating top level errors and log them.
///
/// We basically always want to try and continue on error. This utility function is meant to
/// consume top-level errors by simply logging them.
pub fn log_error(result: Result<()>) -> std::result::Result<(), FatalError> {
match result.into_nested()? {
Err(error @ JfyiError::ImportCanceled(_)) => {
gum::debug!(target: LOG_TARGET, error = ?error);
Ok(())
},
Err(JfyiError::NotAValidator(peer)) => {
gum::debug!(
target: LOG_TARGET,
?peer,
"Dropping message from peer (unknown authority id)"
);
Ok(())
},
Err(error) => {
gum::warn!(target: LOG_TARGET, error = ?error);
Ok(())
},
Ok(()) => Ok(()),
}
}
@@ -0,0 +1,522 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
pin::Pin,
task::{Context, Poll},
time::Duration,
};
use futures::{
channel::oneshot,
future::poll_fn,
pin_mut,
stream::{FuturesUnordered, StreamExt},
Future,
};
use gum::CandidateHash;
use pezkuwi_node_network_protocol::{
authority_discovery::AuthorityDiscovery,
request_response::{
incoming::{self, OutgoingResponse, OutgoingResponseSender},
v1::{DisputeRequest, DisputeResponse},
IncomingRequest, IncomingRequestReceiver,
},
PeerId, UnifiedReputationChange as Rep,
};
use pezkuwi_node_primitives::DISPUTE_WINDOW;
use pezkuwi_node_subsystem::{
messages::{DisputeCoordinatorMessage, ImportStatementsResult},
overseer,
};
use pezkuwi_node_subsystem_util::{runtime, runtime::RuntimeInfo};
use crate::{
metrics::{FAILED, SUCCEEDED},
Metrics, LOG_TARGET,
};
mod error;
/// Rate limiting queues for incoming requests by peers.
mod peer_queues;
/// Batch imports together.
mod batches;
use self::{
batches::{Batches, FoundBatch, PreparedImport},
error::{log_error, JfyiError, JfyiResult, Result},
peer_queues::PeerQueues,
};
/// Reputation change handed to the request receiver for requests that fail to decode.
const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Received message could not be decoded.");
/// Reputation change for requests whose votes could not be turned into signed votes.
const COST_INVALID_SIGNATURE: Rep = Rep::Malicious("Signatures were invalid.");
/// Reputation change for requests from peers we could not find an authority id for.
const COST_NOT_A_VALIDATOR: Rep = Rep::CostMajor("Reporting peer was not a validator.");
/// Invalid imports can be caused by flooding, e.g. by a disabled validator.
const COST_INVALID_IMPORT: Rep =
Rep::CostMinor("Import was deemed invalid by dispute-coordinator.");
/// How many votes must have arrived in the last `BATCH_COLLECTING_INTERVAL`
///
/// in order for a batch to stay alive and not get flushed/imported to the dispute-coordinator.
///
/// This ensures a timely import of batches.
#[cfg(not(test))]
pub const MIN_KEEP_BATCH_ALIVE_VOTES: u32 = 10;
/// Lowered value of `MIN_KEEP_BATCH_ALIVE_VOTES` used when compiling tests.
#[cfg(test)]
pub const MIN_KEEP_BATCH_ALIVE_VOTES: u32 = 2;
/// Time we allow to pass for new votes to trickle in.
///
/// See `MIN_KEEP_BATCH_ALIVE_VOTES` above.
/// Should be greater or equal to `RECEIVE_RATE_LIMIT` (there is no point in checking any faster).
pub const BATCH_COLLECTING_INTERVAL: Duration = Duration::from_millis(500);
/// State for handling incoming `DisputeRequest` messages.
///
/// Requests are first queued per authority (`peer_queues`), then either imported right away or
/// collected in `batches`, and finally confirmed to the requesting peers once the
/// `dispute-coordinator` answered (`pending_imports`).
pub struct DisputesReceiver<Sender, AD> {
/// Access to session information.
runtime: RuntimeInfo,
/// Subsystem sender for communication with other subsystems.
sender: Sender,
/// Channel to retrieve incoming requests from.
receiver: IncomingRequestReceiver<DisputeRequest>,
/// Rate limiting queue for each peer (only authorities).
peer_queues: PeerQueues,
/// Currently active batches of imports per candidate.
batches: Batches,
/// Authority discovery service:
///
/// Used for looking up the authority ids of a sending peer.
authority_discovery: AD,
/// Imports currently being processed by the `dispute-coordinator`.
pending_imports: FuturesUnordered<PendingImport>,
/// Log received requests.
metrics: Metrics,
}
/// Messages as handled by this receiver internally.
///
/// Produced by `receive_message`, which multiplexes all event sources of the receiver, and
/// consumed by `run_inner`.
enum MuxedMessage {
/// An import got confirmed by the coordinator.
///
/// We need to handle those for two reasons:
///
/// - We need to make sure responses are actually sent (therefore we need to await futures
/// promptly).
/// - We need to punish peers whose import got rejected.
ConfirmedImport(ImportResult),
/// A new request has arrived and should be handled.
NewRequest(IncomingRequest<DisputeRequest>),
/// Rate limit timer hit - is time to process one row of messages.
///
/// This is the result of calling `self.peer_queues.pop_reqs()`.
WakePeerQueuesPopReqs(Vec<IncomingRequest<DisputeRequest>>),
/// It is time to check batches.
///
/// Every `BATCH_COLLECTING_INTERVAL` we check whether less than `MIN_KEEP_BATCH_ALIVE_VOTES`
/// new votes arrived, if so the batch is ready for import.
///
/// This is the result of calling `self.batches.check_batches()`.
WakeCheckBatches(Vec<PreparedImport>),
}
impl<Sender, AD> DisputesReceiver<Sender, AD>
where
AD: AuthorityDiscovery,
Sender: overseer::DisputeDistributionSenderTrait,
{
/// Set up a receiver, ready to be driven via `run`.
///
/// The receiver starts out with empty peer queues, no batches and no pending imports. Its
/// `RuntimeInfo` is configured without a keystore and with a session cache of
/// `DISPUTE_WINDOW` entries.
pub fn new(
    sender: Sender,
    receiver: IncomingRequestReceiver<DisputeRequest>,
    authority_discovery: AD,
    metrics: Metrics,
) -> Self {
    let runtime_config =
        runtime::Config { keystore: None, session_cache_lru_size: DISPUTE_WINDOW.get() };
    Self {
        runtime: RuntimeInfo::new_with_config(runtime_config),
        sender,
        receiver,
        peer_queues: PeerQueues::new(),
        batches: Batches::new(),
        authority_discovery,
        pending_imports: FuturesUnordered::new(),
        metrics,
    }
}
/// Drive the receiver until a fatal error occurs.
///
/// Non-fatal errors are logged by `log_error` and processing simply continues. This is an
/// endless loop otherwise and should be spawned into its own task.
pub async fn run(mut self) {
    let fatal = loop {
        if let Err(fatal) = log_error(self.run_inner().await) {
            break fatal;
        }
    };
    gum::debug!(
        target: LOG_TARGET,
        error = ?fatal,
        "Shutting down"
    );
}
/// Handle exactly one `MuxedMessage`.
///
/// The actual work happens in three phases, depending on the message received:
///
/// 1. Receive and queue incoming messages until the rate limit timer hits.
/// 2. Do import/batching for the head of all queues.
/// 3. Check and flush any ready batches.
///
/// In addition, confirmations of the coordinator are forwarded to the requesting peers.
async fn run_inner(&mut self) -> Result<()> {
    match self.receive_message().await? {
        MuxedMessage::ConfirmedImport(import_result) => {
            self.update_imported_requests_metrics(&import_result);
            // Confirm imports to requesters/punish them on invalid imports:
            send_responses_to_requesters(import_result).await?;
        },
        MuxedMessage::NewRequest(req) => {
            // Phase 1:
            self.metrics.on_received_request();
            self.dispatch_to_queues(req).await?;
        },
        MuxedMessage::WakePeerQueuesPopReqs(reqs) => {
            // Phase 2:
            for req in reqs {
                // Only fatal errors abort the loop - a failed import of one peer must not
                // cancel the imports of the others:
                if let Err(fatal) = log_error(self.start_import_or_batch(req).await) {
                    return Err(fatal.into());
                }
            }
        },
        MuxedMessage::WakeCheckBatches(ready_imports) => {
            // Phase 3:
            self.import_ready_batches(ready_imports).await;
        },
    }
    Ok(())
}
/// Receive one `MuxedMessage`.
///
///
/// Dispatching events to messages as they happen.
///
/// Sources are polled in a fixed order: confirmed imports, rate limited peer queues, ready
/// batches and finally new incoming requests. The first source that is ready determines the
/// returned message.
///
/// The futures for `pop_reqs`, `check_batches` and `recv` are created anew on every poll and
/// dropped again at the end of the closure if nothing was ready.
async fn receive_message(&mut self) -> Result<MuxedMessage> {
poll_fn(|ctx| {
// In case of Ready(None), we want to wait for pending requests:
if let Poll::Ready(Some(v)) = self.pending_imports.poll_next_unpin(ctx) {
// `v?` makes a failed import (e.g. a canceled confirmation) the result of this function.
return Poll::Ready(Ok(MuxedMessage::ConfirmedImport(v?)));
}
let rate_limited = self.peer_queues.pop_reqs();
pin_mut!(rate_limited);
// We poll rate_limit before batches, so we don't unnecessarily delay importing to
// batches.
if let Poll::Ready(reqs) = rate_limited.poll(ctx) {
return Poll::Ready(Ok(MuxedMessage::WakePeerQueuesPopReqs(reqs)));
}
let ready_batches = self.batches.check_batches();
pin_mut!(ready_batches);
if let Poll::Ready(ready_batches) = ready_batches.poll(ctx) {
return Poll::Ready(Ok(MuxedMessage::WakeCheckBatches(ready_batches)));
}
// `COST_INVALID_REQUEST` is the reputation change passed along for invalid requests.
let next_req = self.receiver.recv(|| vec![COST_INVALID_REQUEST]);
pin_mut!(next_req);
if let Poll::Ready(r) = next_req.poll(ctx) {
return match r {
Err(e) => Poll::Ready(Err(incoming::Error::from(e).into())),
Ok(v) => Poll::Ready(Ok(MuxedMessage::NewRequest(v))),
};
}
// Nothing was ready:
Poll::Pending
})
.await
}
/// Put an incoming request into the rate limiting queue of its sender.
///
/// - Check sender is authority, otherwise answer with `COST_NOT_A_VALIDATOR`.
/// - Dispatch message to corresponding queue in `peer_queues`.
/// - If queue is full, drop message and answer with an error (no reputation change).
///
/// # Errors
///
/// - `JfyiError::NotAValidator` if no authority id is known for the sending peer.
/// - `JfyiError::AuthorityFlooding` if the queue of the authority was already full.
/// - `JfyiError::SendResponses` if sending one of the error responses failed.
async fn dispatch_to_queues(&mut self, req: IncomingRequest<DisputeRequest>) -> JfyiResult<()> {
    let peer = req.peer;

    // Only accept messages from validators, in case there are multiple `AuthorityId`s, we
    // just take the first one. On session boundaries this might allow validators to double
    // their rate limit for a short period of time, which seems acceptable.
    let first_authority = self
        .authority_discovery
        .get_authority_ids_by_peer_id(peer)
        .await
        .and_then(|ids| ids.into_iter().next());
    let authority_id = if let Some(auth_id) = first_authority {
        auth_id
    } else {
        req.send_outgoing_response(OutgoingResponse {
            result: Err(()),
            reputation_changes: vec![COST_NOT_A_VALIDATOR],
            sent_feedback: None,
        })
        .map_err(|_| JfyiError::SendResponses(vec![peer]))?;
        return Err(JfyiError::NotAValidator(peer));
    };

    // Queue request:
    let (authority_id, rejected) = match self.peer_queues.push_req(authority_id, req) {
        Ok(()) => return Ok(()),
        Err(overflow) => overflow,
    };

    gum::debug!(
        target: LOG_TARGET,
        ?authority_id,
        ?peer,
        "Peer hit the rate limit - dropping message."
    );
    rejected
        .send_outgoing_response(OutgoingResponse {
            result: Err(()),
            reputation_changes: vec![],
            sent_feedback: None,
        })
        .map_err(|_| JfyiError::SendResponses(vec![peer]))?;
    Err(JfyiError::AuthorityFlooding(authority_id))
}
/// Check the votes of a request and either import them directly or add them to a batch.
///
/// After the signature check the batch for the candidate is looked up: if there was none
/// yet, one gets opened and the votes are imported to the `dispute-coordinator`
/// immediately; if a batch already exists, the votes are added to that batch instead.
///
/// # Errors
///
/// - `JfyiError::InvalidSignature` if the votes could not be converted into signed votes.
/// - `JfyiError::RedundantMessage` if the batch rejected the votes.
/// - `JfyiError::SetPeerReputation`/`JfyiError::SendResponses` if sending the corresponding
///   error response failed.
/// - Any error of retrieving the session info or of finding/creating the batch.
async fn start_import_or_batch(
    &mut self,
    incoming: IncomingRequest<DisputeRequest>,
) -> Result<()> {
    let IncomingRequest { peer, payload, pending_response } = incoming;

    let info = self
        .runtime
        .get_session_info_by_index(
            &mut self.sender,
            payload.0.candidate_receipt.descriptor.relay_parent(),
            payload.0.session_index,
        )
        .await?;

    let (candidate_receipt, valid_vote, invalid_vote) =
        match payload.0.try_into_signed_votes(&info.session_info) {
            Ok(votes) => votes,
            Err(()) => {
                // Signature invalid:
                pending_response
                    .send_outgoing_response(OutgoingResponse {
                        result: Err(()),
                        reputation_changes: vec![COST_INVALID_SIGNATURE],
                        sent_feedback: None,
                    })
                    .map_err(|_| JfyiError::SetPeerReputation(peer))?;
                return Err(JfyiError::InvalidSignature(peer).into());
            },
        };

    let candidate_hash = *valid_vote.0.candidate_hash();

    let batch = match self.batches.find_batch(candidate_hash, candidate_receipt)? {
        FoundBatch::Found(batch) => batch,
        FoundBatch::Created(batch) => {
            // There was no entry yet - start import immediately:
            gum::trace!(
                target: LOG_TARGET,
                ?candidate_hash,
                ?peer,
                "No batch yet - triggering immediate import"
            );
            let import = PreparedImport {
                candidate_receipt: batch.candidate_receipt().clone(),
                statements: vec![valid_vote, invalid_vote],
                requesters: vec![(peer, pending_response)],
            };
            self.start_import(import).await;
            return Ok(());
        },
    };

    gum::trace!(target: LOG_TARGET, ?candidate_hash, "Batch exists - batching request");
    if let Err(pending_response) =
        batch.add_votes(valid_vote, invalid_vote, peer, pending_response)
    {
        // We don't expect honest peers to send redundant votes within a single batch,
        // as the timeout for retry is much higher. Still we don't want to punish the
        // node as it might not be the node's fault. Some other (malicious) node could
        // have been faster sending the same votes in order to harm the reputation of
        // that honest node. Given that we already have a rate limit, if a validator
        // chooses to waste available rate with redundant votes - so be it. The actual
        // dispute resolution is unaffected.
        gum::debug!(
            target: LOG_TARGET,
            ?peer,
            "Peer sent completely redundant votes within a single batch - that looks fishy!",
        );
        // While we have seen duplicate votes, we cannot confirm as we don't know yet
        // whether the batch is going to be confirmed, so we assume the worst. We don't
        // want to push the pending response to the batch either as that would be
        // unbounded, only limited by the rate limit.
        let rejection = OutgoingResponse {
            result: Err(()),
            reputation_changes: Vec::new(),
            sent_feedback: None,
        };
        pending_response
            .send_outgoing_response(rejection)
            .map_err(|_| JfyiError::SendResponses(vec![peer]))?;
        return Err(JfyiError::RedundantMessage(peer).into());
    }

    Ok(())
}
/// Hand all given `PreparedImport`s (batches that became ready) over to the
/// dispute-coordinator, one after the other.
async fn import_ready_batches(&mut self, ready_imports: Vec<PreparedImport>) {
    let mut remaining = ready_imports.into_iter();
    while let Some(prepared) = remaining.next() {
        self.start_import(prepared).await;
    }
}
/// Send the statements of `import` to the dispute-coordinator and track the confirmation.
///
/// Session index and candidate hash are taken from the first statement. The receiving end
/// of the confirmation channel is stored, together with the requesters, in
/// `pending_imports`. Imports without any statement are skipped (and logged).
async fn start_import(&mut self, import: PreparedImport) {
    let PreparedImport { candidate_receipt, statements, requesters } = import;

    let (session_index, candidate_hash) = match statements.first() {
        Some(vote) => (vote.0.session_index(), *vote.0.candidate_hash()),
        None => {
            gum::debug!(
                target: LOG_TARGET,
                candidate_hash = ?candidate_receipt.hash(),
                "Not importing empty batch"
            );
            return;
        },
    };

    let (confirmation_tx, confirmation_rx) = oneshot::channel();
    let message = DisputeCoordinatorMessage::ImportStatements {
        candidate_receipt,
        session: session_index,
        statements,
        pending_confirmation: Some(confirmation_tx),
    };
    self.sender.send_message(message).await;

    self.pending_imports.push(PendingImport {
        candidate_hash,
        requesters,
        pending_response: confirmation_rx,
    });
}
/// Report a finished import to the metrics.
///
/// The import is labeled `SUCCEEDED` or `FAILED` depending on the coordinator's verdict and
/// reported together with the number of requesters.
fn update_imported_requests_metrics(&self, result: &ImportResult) {
    let requester_count = result.requesters.len();
    let label = match result.result {
        ImportStatementsResult::InvalidImport => FAILED,
        ImportStatementsResult::ValidImport => SUCCEEDED,
    };
    self.metrics.on_imported(label, requester_count);
}
}
async fn send_responses_to_requesters(import_result: ImportResult) -> JfyiResult<()> {
let ImportResult { requesters, result } = import_result;
let mk_response = match result {
ImportStatementsResult::ValidImport => || OutgoingResponse {
result: Ok(DisputeResponse::Confirmed),
reputation_changes: Vec::new(),
sent_feedback: None,
},
ImportStatementsResult::InvalidImport => || OutgoingResponse {
result: Err(()),
reputation_changes: vec![COST_INVALID_IMPORT],
sent_feedback: None,
},
};
let mut sending_failed_for = Vec::new();
for (peer, pending_response) in requesters {
if let Err(()) = pending_response.send_outgoing_response(mk_response()) {
sending_failed_for.push(peer);
}
}
if !sending_failed_for.is_empty() {
Err(JfyiError::SendResponses(sending_failed_for))
} else {
Ok(())
}
}
/// A future that resolves into an `ImportResult` when ready.
///
/// This future is used on `dispute-coordinator` import messages for the oneshot response receiver
/// to:
/// - Keep track of concerned `CandidateHash` for reporting errors.
/// - Keep track of requesting peers so we can confirm the import/punish them on invalid imports.
struct PendingImport {
/// Candidate the import is about, reported in `JfyiError::ImportCanceled`.
candidate_hash: CandidateHash,
/// Peers (with their response senders) waiting for the result of this import.
requesters: Vec<(PeerId, OutgoingResponseSender<DisputeRequest>)>,
/// Receiving end of the confirmation channel handed to the `dispute-coordinator`.
pending_response: oneshot::Receiver<ImportStatementsResult>,
}
/// A `PendingImport` becomes an `ImportResult` once done.
///
/// Consumed by `send_responses_to_requesters`, which answers each requester according to
/// `result`.
struct ImportResult {
/// Requesters of that import.
requesters: Vec<(PeerId, OutgoingResponseSender<DisputeRequest>)>,
/// Actual result of the import.
result: ImportStatementsResult,
}
impl PendingImport {
    /// Wait for the confirmation of the import.
    ///
    /// On success the requesters are moved out of `self` into the returned `ImportResult`.
    /// If the confirmation channel got closed without an answer,
    /// `JfyiError::ImportCanceled` is returned.
    async fn wait_for_result(&mut self) -> JfyiResult<ImportResult> {
        match (&mut self.pending_response).await {
            Ok(result) =>
                Ok(ImportResult { requesters: std::mem::take(&mut self.requesters), result }),
            Err(_) => Err(JfyiError::ImportCanceled(self.candidate_hash)),
        }
    }
}
/// Makes `PendingImport` usable in `FuturesUnordered` by delegating to `wait_for_result`.
impl Future for PendingImport {
type Output = JfyiResult<ImportResult>;
fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
// A fresh `wait_for_result` future is created for every poll. All state it works on
// (receiver and requesters) is borrowed from `self`.
let fut = self.wait_for_result();
pin_mut!(fut);
fut.poll(cx)
}
}
@@ -0,0 +1,141 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::{hash_map::Entry, HashMap, VecDeque};
use futures::future::pending;
use futures_timer::Delay;
use pezkuwi_node_network_protocol::request_response::{v1::DisputeRequest, IncomingRequest};
use pezkuwi_primitives::AuthorityDiscoveryId;
use crate::RECEIVE_RATE_LIMIT;
/// How many messages we are willing to queue per peer (validator).
///
/// The larger this value is, the larger bursts are allowed to be without us dropping messages. On
/// the flip side this gets allocated per validator, so for a size of 10 this will result
/// in `10_000 * size_of(IncomingRequest)` in the worst case (this figure presumably assumes
/// 1000 validators - confirm).
///
/// `PEER_QUEUE_CAPACITY` must not be 0 for obvious reasons.
#[cfg(not(test))]
pub const PEER_QUEUE_CAPACITY: usize = 10;
/// Lowered `PEER_QUEUE_CAPACITY` used when compiling tests.
#[cfg(test)]
pub const PEER_QUEUE_CAPACITY: usize = 2;
/// Queues for messages from authority peers for rate limiting.
///
/// Invariants ensured:
///
/// 1. No queue will ever have more than `PEER_QUEUE_CAPACITY` elements.
/// 2. There are no empty queues. Whenever a queue gets empty, it is removed. This way checking
/// whether there are any messages queued is cheap.
/// 3. As long as not empty, `pop_reqs` will, if called in sequence, not return `Ready` more often
/// than once for every `RECEIVE_RATE_LIMIT`, but it will always return Ready eventually.
/// 4. If empty `pop_reqs` will never return `Ready`, but will always be `Pending`.
pub struct PeerQueues {
/// Actual queues.
///
/// One FIFO queue of not yet processed requests per authority.
queues: HashMap<AuthorityDiscoveryId, VecDeque<IncomingRequest<DisputeRequest>>>,
/// Delay timer for establishing the rate limit.
///
/// `None` whenever no timer is armed; `pop_reqs` stays pending in that case.
rate_limit_timer: Option<Delay>,
}
impl PeerQueues {
/// Create `PeerQueues` without any queued request and without an armed rate limit timer.
pub fn new() -> Self {
    let queues = HashMap::new();
    Self { queues, rate_limit_timer: None }
}
/// Append an incoming request to the queue of the given authority.
///
/// A queue is created if the authority has none yet. On success it is made sure that the
/// rate limit timer is armed.
///
/// Returns: `Ok(())` if succeeded, `Err((args))` - handing the arguments back - if the queue
/// of that authority already holds `PEER_QUEUE_CAPACITY` requests.
pub fn push_req(
    &mut self,
    peer: AuthorityDiscoveryId,
    req: IncomingRequest<DisputeRequest>,
) -> Result<(), (AuthorityDiscoveryId, IncomingRequest<DisputeRequest>)> {
    match self.queues.entry(peer) {
        Entry::Occupied(slot) if slot.get().len() >= PEER_QUEUE_CAPACITY =>
            return Err((slot.key().clone(), req)),
        Entry::Occupied(slot) => slot.into_mut().push_back(req),
        Entry::Vacant(slot) => slot.insert(VecDeque::new()).push_back(req),
    }
    // We have at least one element to process - rate limit `timer` needs to exist now:
    self.ensure_timer();
    Ok(())
}
/// Take the oldest request of every queue and return them for processing.
///
/// This gets one message from each peer that has sent at least one. Queues that became
/// empty are removed; if any request is left afterwards, the rate limit timer is armed
/// again.
///
/// This function is rate limited, if called in sequence it will not return more often than
/// every `RECEIVE_RATE_LIMIT`.
///
/// NOTE: If empty this function will not return `Ready` at all, but will always be `Pending`.
pub async fn pop_reqs(&mut self) -> Vec<IncomingRequest<DisputeRequest>> {
    self.wait_for_timer().await;

    let drained = std::mem::take(&mut self.queues);
    let mut heads = Vec::with_capacity(drained.len());
    for (authority, mut remaining) in drained {
        match remaining.pop_front() {
            Some(head) => heads.push(head),
            None => debug_assert!(false, "Invariant that queues are never empty is broken."),
        }
        // Only keep queues that still have something to process:
        if !remaining.is_empty() {
            self.queues.insert(authority, remaining);
        }
    }

    if !self.queues.is_empty() {
        // Still not empty - we should get woken at some point.
        self.ensure_timer();
    }
    heads
}
/// Whether or not all queues are empty.
///
/// As queues are removed in `pop_reqs` once they ran empty, it is sufficient to check whether
/// there is any queue at all.
pub fn is_empty(&self) -> bool {
self.queues.is_empty()
}
/// Ensure there is an active `timer`.
///
/// Checks whether one exists and if not creates one that fires after `RECEIVE_RATE_LIMIT`.
/// An already existing timer is left untouched.
///
/// Returns: A mutable reference to the active timer.
fn ensure_timer(&mut self) -> &mut Delay {
    // Construct the `Delay` lazily: building one eagerly would create (and immediately drop
    // again) a timer on every call, even if one is already armed.
    self.rate_limit_timer.get_or_insert_with(|| Delay::new(RECEIVE_RATE_LIMIT))
}
/// Wait for the rate limit `timer` to fire, or be `Pending` forever if there is none.
///
/// Once the timer fired, it gets set back to `None`.
async fn wait_for_timer(&mut self) {
    if let Some(timer) = self.rate_limit_timer.as_mut() {
        timer.await;
    } else {
        pending::<()>().await;
    }
    self.rate_limit_timer = None;
}
}
@@ -0,0 +1,73 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Error handling related code and Error/Result definitions.
use pezkuwi_node_primitives::disputes::DisputeMessageCheckError;
use pezkuwi_node_subsystem::SubsystemError;
use pezkuwi_node_subsystem_util::runtime;
/// Errors of the dispute sending side.
///
/// Variants marked `#[fatal]` are the fatal ones; all others are non-fatal.
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// Spawning a task failed - always fatal.
#[fatal]
#[error("Spawning subsystem task failed")]
SpawnTask(#[source] SubsystemError),
/// Accessing runtime information failed (fatality is forwarded from the wrapped error).
#[fatal(forward)]
#[error("Error while accessing runtime information")]
Runtime(#[from] runtime::Error),
/// We need available active heads for finding relevant authorities.
#[error("No active heads available - needed for finding relevant authorities.")]
NoActiveHeads,
/// This error likely indicates a bug in the coordinator.
#[error("Oneshot for asking dispute coordinator for active disputes got canceled.")]
AskActiveDisputesCanceled,
/// This error likely indicates a bug in the coordinator.
#[error("Oneshot for asking dispute coordinator for candidate votes got canceled.")]
AskCandidateVotesCanceled,
/// This error does indicate a bug in the coordinator.
///
/// We were not able to successfully construct a `DisputeMessage` from disputes votes.
#[error("Invalid dispute encountered")]
InvalidDisputeFromCoordinator(#[source] DisputeMessageCheckError),
/// This error does indicate a bug in the coordinator.
///
/// We did not receive votes on both sides for `CandidateVotes` received from the coordinator.
#[error("Missing votes for valid dispute")]
MissingVotesFromCoordinator,
/// This error does indicate a bug in the coordinator.
///
/// `SignedDisputeStatement` could not be reconstructed from recorded statements.
#[error("Invalid statements from coordinator")]
InvalidStatementFromCoordinator,
/// This error does indicate a bug in the coordinator.
///
/// A statement's `ValidatorIndex` could not be looked up.
#[error("ValidatorIndex of statement could not be found")]
InvalidValidatorIndexFromCoordinator,
}
/// Result type of the sender, using the general [`Error`] as error type.
pub type Result<T> = std::result::Result<T, Error>;
/// Result type whose error is a `JfyiError`.
pub type JfyiErrorResult<T> = std::result::Result<T, JfyiError>;
@@ -0,0 +1,392 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::{
collections::{BTreeMap, HashMap, HashSet},
pin::Pin,
task::Poll,
time::Duration,
};
use futures::{channel::oneshot, future::poll_fn, Future};
use futures_timer::Delay;
use indexmap::{map::Entry, IndexMap};
use pezkuwi_node_network_protocol::request_response::v1::DisputeRequest;
use pezkuwi_node_primitives::{DisputeMessage, DisputeStatus};
use pezkuwi_node_subsystem::{
messages::DisputeCoordinatorMessage, overseer, ActiveLeavesUpdate, SubsystemSender,
};
use pezkuwi_node_subsystem_util::{nesting_sender::NestingSender, runtime::RuntimeInfo};
use pezkuwi_primitives::{CandidateHash, Hash, SessionIndex};
/// For each ongoing dispute we have a `SendTask` which takes care of it.
///
/// It is going to spawn real tasks as it sees fit for getting the votes of the particular dispute
/// out.
///
/// As we assume disputes have a priority, we start sending for disputes in the order
/// `start_sender` got called.
mod send_task;
use send_task::SendTask;
pub use send_task::TaskFinish;
/// Error and [`Result`] type for sender.
mod error;
pub use error::{Error, FatalError, JfyiError, Result};
use self::error::JfyiErrorResult;
use crate::{Metrics, LOG_TARGET, SEND_RATE_LIMIT};
/// Messages as sent by background tasks.
///
/// They are consumed by `DisputeSender::on_message`.
#[derive(Debug)]
pub enum DisputeSenderMessage {
/// A task finished.
///
/// Carries the candidate, the receiver and the result of the delivery attempt.
TaskFinish(TaskFinish),
/// A request for active disputes to the dispute-coordinator finished.
///
/// Carries either the active disputes, keyed by session index and candidate hash, or a
/// `JfyiError`.
ActiveDisputesReady(JfyiErrorResult<BTreeMap<(SessionIndex, CandidateHash), DisputeStatus>>),
}
/// The `DisputeSender` keeps track of all ongoing disputes we need to send statements out.
///
/// For each dispute a `SendTask` is responsible for sending to the concerned validators for that
/// particular dispute. The `DisputeSender` keeps track of those tasks, informs them about new
/// sessions/validator sets and cleans them up when they become obsolete.
///
/// The unit of work for the `DisputeSender` is a dispute, represented by `SendTask`s.
///
/// `M` is the message type of the `NestingSender` that background tasks report back through.
pub struct DisputeSender<M> {
/// All heads we currently consider active.
active_heads: Vec<Hash>,
/// List of currently active sessions.
///
/// Value is the hash that was used for the query.
active_sessions: HashMap<SessionIndex, Hash>,
/// All ongoing dispute sending this subsystem is aware of.
///
/// Using an `IndexMap` so items can be iterated in the order of insertion.
disputes: IndexMap<CandidateHash, SendTask<M>>,
/// Sender to be cloned for `SendTask`s.
tx: NestingSender<M, DisputeSenderMessage>,
/// `Some` if we are waiting for a response `DisputeCoordinatorMessage::ActiveDisputes`.
///
/// Reset to `None` once the `ActiveDisputesReady` message has been received.
waiting_for_active_disputes: Option<WaitForActiveDisputesState>,
/// Future for delaying too frequent creation of dispute sending tasks.
///
/// Also awaited before refreshing sends of existing tasks.
rate_limit: RateLimit,
/// Metrics for reporting stats about sent requests.
metrics: Metrics,
}
/// State we keep while waiting for active disputes.
///
/// When we send `DisputeCoordinatorMessage::ActiveDisputes`, this is the state we keep while
/// waiting for the response.
struct WaitForActiveDisputesState {
/// Have we seen any new sessions since last refresh?
///
/// Accumulated (logical or) over all leaf updates that arrive while the request is in flight.
have_new_sessions: bool,
}
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
impl<M: 'static + Send + Sync> DisputeSender<M> {
/// Construct an idle `DisputeSender`.
///
/// It starts out without any active heads, sessions or disputes, is not waiting for active
/// disputes, and its rate limit is immediately ready.
pub fn new(tx: NestingSender<M, DisputeSenderMessage>, metrics: Metrics) -> Self {
    let rate_limit = RateLimit::new();
    Self {
        tx,
        metrics,
        rate_limit,
        waiting_for_active_disputes: None,
        active_heads: Vec::default(),
        active_sessions: HashMap::default(),
        disputes: IndexMap::default(),
    }
}
/// Start distributing the given dispute message by creating a `SendTask` for it.
///
/// Does nothing if sending for the candidate is already active.
///
/// Creation of new tasks is rate-limited by `SEND_RATE_LIMIT`: the call waits on the rate limit
/// before constructing the task, so it blocks if called too frequently.
pub async fn start_sender<Context>(
    &mut self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    msg: DisputeMessage,
) -> Result<()> {
    let request: DisputeRequest = msg.into();
    let candidate_hash = request.0.candidate_receipt.hash();

    let Entry::Vacant(slot) = self.disputes.entry(candidate_hash) else {
        gum::trace!(target: LOG_TARGET, ?candidate_hash, "Dispute sending already active.");
        return Ok(());
    };

    self.rate_limit.limit("in start_sender", candidate_hash).await;

    let task_tx = NestingSender::new(self.tx.clone(), DisputeSenderMessage::TaskFinish);
    let send_task =
        SendTask::new(ctx, runtime, &self.active_sessions, task_tx, request, &self.metrics)
            .await?;
    slot.insert(send_task);
    Ok(())
}
/// Process a message sent by one of our background tasks.
///
/// - `TaskFinish`: record the result in the metrics and forward it to the matching `SendTask`,
///   if that dispute is still tracked.
/// - `ActiveDisputesReady`: stop waiting for active disputes and handle the received set.
pub async fn on_message<Context>(
    &mut self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    msg: DisputeSenderMessage,
) -> Result<()> {
    match msg {
        DisputeSenderMessage::TaskFinish(TaskFinish { candidate_hash, receiver, result }) => {
            self.metrics.on_sent_request(result.as_metrics_label());

            let Some(task) = self.disputes.get_mut(&candidate_hash) else {
                // Can happen when a dispute ends, with messages still in queue:
                gum::trace!(
                    target: LOG_TARGET,
                    ?result,
                    "Received `FromSendingTask::Finished` for non existing dispute."
                );
                return Ok(());
            };
            task.on_finished_send(&receiver, result);
        },
        DisputeSenderMessage::ActiveDisputesReady(result) => {
            // Clear the waiting state first, so it is reset even if the request failed.
            let have_new_sessions = self
                .waiting_for_active_disputes
                .take()
                .map_or(false, |state| state.have_new_sessions);
            let active_disputes = result?;
            self.handle_new_active_disputes(ctx, runtime, active_disputes, have_new_sessions)
                .await?;
        },
    }
    Ok(())
}
/// React to a change in active leaves.
///
/// Updates the tracked heads and sessions. Unless a request is already in flight, spawns a
/// background task which asks the dispute-coordinator for the active disputes and reports the
/// answer back as `DisputeSenderMessage::ActiveDisputesReady`.
pub async fn update_leaves<Context>(
    &mut self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    update: ActiveLeavesUpdate,
) -> Result<()> {
    let ActiveLeavesUpdate { activated, deactivated } = update;
    let gone: HashSet<_> = deactivated.into_iter().collect();
    self.active_heads.retain(|head| !gone.contains(head));
    self.active_heads.extend(activated.into_iter().map(|leaf| leaf.hash));

    let have_new_sessions = self.refresh_sessions(ctx, runtime).await?;

    if let Some(in_flight) = self.waiting_for_active_disputes.take() {
        // A request is still outstanding: only remember whether sessions changed meanwhile.
        self.waiting_for_active_disputes = Some(WaitForActiveDisputesState {
            have_new_sessions: in_flight.have_new_sessions || have_new_sessions,
        });
        gum::debug!(
            target: LOG_TARGET,
            "Dispute coordinator slow? We are still waiting for data on next active leaves update."
        );
        return Ok(());
    }

    // Not yet waiting for data, request an update:
    self.waiting_for_active_disputes = Some(WaitForActiveDisputesState { have_new_sessions });
    let mut sender = ctx.sender().clone();
    let mut tx = self.tx.clone();
    let get_active_disputes_task = async move {
        let active = get_active_disputes(&mut sender).await;
        let report = DisputeSenderMessage::ActiveDisputesReady(active);
        if let Err(err) = tx.send_message(report).await {
            gum::debug!(
                target: LOG_TARGET,
                ?err,
                "Sending `DisputeSenderMessage` from background task failed."
            );
        }
    };
    ctx.spawn("get_active_disputes", Box::pin(get_active_disputes_task))
        .map_err(FatalError::SpawnTask)?;
    Ok(())
}
/// Process the set of active disputes received from the dispute-coordinator.
///
/// - Drops tasks of disputes which are no longer active.
/// - For remaining tasks, in order of insertion: refreshes sends if sessions changed or the task
///   has failed sends.
///
/// Waits on the `SEND_RATE_LIMIT` between refreshes, therefore it might block.
async fn handle_new_active_disputes<Context>(
    &mut self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    active_disputes: BTreeMap<(SessionIndex, CandidateHash), DisputeStatus>,
    have_new_sessions: bool,
) -> Result<()> {
    let still_active: HashSet<CandidateHash> =
        active_disputes.into_keys().map(|(_, candidate)| candidate).collect();

    // Cleanup obsolete senders (retain keeps order of remaining elements):
    self.disputes.retain(|candidate_hash, _| still_active.contains(candidate_hash));

    // Iterates in order of insertion:
    let mut should_rate_limit = true;
    for (candidate_hash, dispute) in self.disputes.iter_mut() {
        if !have_new_sessions && !dispute.has_failed_sends() {
            continue;
        }
        if should_rate_limit {
            self.rate_limit
                .limit("while going through new sessions/failed sends", *candidate_hash)
                .await;
        }
        let sends_happened = dispute
            .refresh_sends(ctx, runtime, &self.active_sessions, &self.metrics)
            .await?;
        // Only rate limit if we actually sent something out _and_ it was not just because
        // of errors on previous sends.
        //
        // Reasoning: It would not be acceptable to slow down the whole subsystem, just
        // because of a few bad peers having problems. It is actually better to risk
        // running into their rate limit in that case and accept a minor reputation change.
        should_rate_limit = sends_happened && have_new_sessions;
    }
    Ok(())
}
/// Bring `active_sessions` in line with the currently active heads.
///
/// Returns: `true` if the set of session indices changed.
async fn refresh_sessions<Context>(
    &mut self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
) -> Result<bool> {
    let new_sessions = get_active_session_indices(ctx, runtime, &self.active_heads).await?;

    // Two key sets are equal iff they have the same size and one contains the other.
    let updated = new_sessions.len() != self.active_sessions.len() ||
        new_sessions.keys().any(|session| !self.active_sessions.contains_key(session));

    // Update in any case, so we use current heads for queries:
    self.active_sessions = new_sessions;
    Ok(updated)
}
}
/// Rate limiting logic.
///
/// Suitable for the sending side.
struct RateLimit {
/// Timer that has to elapse before the next rate limited action may proceed.
limit: Delay,
}
impl RateLimit {
/// Create new `RateLimit` that is immediately ready.
fn new() -> Self {
// Start with an empty duration, as there has not been any previous call.
Self { limit: Delay::new(Duration::new(0, 0)) }
}
/// Initialized with actual `SEND_RATE_LIMIT` duration.
fn new_limit() -> Self {
Self { limit: Delay::new(SEND_RATE_LIMIT) }
}
/// Wait until ready and prepare for next call.
///
/// String given as occasion and candidate hash are logged in case the rate limit hit.
async fn limit(&mut self, occasion: &'static str, candidate_hash: CandidateHash) {
// Wait for rate limit and add some logging:
let mut num_wakes: u32 = 0;
poll_fn(|cx| {
let old_limit = Pin::new(&mut self.limit);
match old_limit.poll(cx) {
Poll::Pending => {
gum::debug!(
target: LOG_TARGET,
?occasion,
?candidate_hash,
?num_wakes,
"Sending rate limit hit, slowing down requests"
);
num_wakes += 1;
Poll::Pending
},
Poll::Ready(()) => Poll::Ready(()),
}
})
.await;
*self = Self::new_limit();
}
}
/// Retrieve the currently active sessions.
///
/// For every given head the session index of its child is fetched. The result maps each session
/// index to the head that was used for the query; if several heads yield the same session index,
/// the last one in `active_heads` wins.
///
/// As a side effect the `SessionInfo` of each found session is requested, so it gets cached.
/// A failure of that request is only logged.
///
/// Fails if the session index for any head cannot be fetched.
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
async fn get_active_session_indices<Context>(
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    active_heads: &[Hash],
) -> Result<HashMap<SessionIndex, Hash>> {
    let mut indices = HashMap::new();
    // Iterate all heads we track as active and fetch the child's session index.
    for head in active_heads {
        let session_index = runtime.get_session_index_for_child(ctx.sender(), *head).await?;
        // Cache session info
        if let Err(err) =
            runtime.get_session_info_by_index(ctx.sender(), *head, session_index).await
        {
            gum::debug!(target: LOG_TARGET, ?err, ?session_index, "Can't cache SessionInfo");
        }
        indices.insert(session_index, *head);
    }
    Ok(indices)
}
/// Ask the dispute coordinator for the set of active disputes.
///
/// Returns `JfyiError::AskActiveDisputesCanceled` if the response channel gets dropped without
/// an answer.
async fn get_active_disputes<Sender>(
    sender: &mut Sender,
) -> JfyiErrorResult<BTreeMap<(SessionIndex, CandidateHash), DisputeStatus>>
where
    Sender: SubsystemSender<DisputeCoordinatorMessage>,
{
    let (response_tx, response_rx) = oneshot::channel();
    let request = DisputeCoordinatorMessage::ActiveDisputes(response_tx);
    sender.send_message(request).await;
    match response_rx.await {
        Ok(active_disputes) => Ok(active_disputes),
        Err(_) => Err(JfyiError::AskActiveDisputesCanceled),
    }
}
@@ -0,0 +1,328 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::collections::{HashMap, HashSet};
use futures::{Future, FutureExt};
use pezkuwi_node_network_protocol::{
request_response::{
outgoing::RequestError,
v1::{DisputeRequest, DisputeResponse},
OutgoingRequest, OutgoingResult, Recipient, Requests,
},
IfDisconnected,
};
use pezkuwi_node_subsystem::{messages::NetworkBridgeTxMessage, overseer};
use pezkuwi_node_subsystem_util::{metrics, nesting_sender::NestingSender, runtime::RuntimeInfo};
use pezkuwi_primitives::{AuthorityDiscoveryId, CandidateHash, Hash, SessionIndex, ValidatorIndex};
use super::error::{FatalError, Result};
use crate::{
metrics::{FAILED, SUCCEEDED},
Metrics, LOG_TARGET,
};
/// Delivery status for a particular dispute.
///
/// Keeps track of all the validators that have to be reached for a dispute.
///
/// The unit of work for a `SendTask` is an authority/validator.
///
/// `M` is the message type of the `NestingSender` used for reporting finished deliveries.
pub struct SendTask<M> {
/// The request we are supposed to get out to all `teyrchain` validators of the dispute's
/// session and to all current authorities.
request: DisputeRequest,
/// The set of authorities we need to send our messages to. This set will change at session
/// boundaries. It will always be at least the `teyrchain` validators of the session where the
/// dispute happened and the authorities of the current sessions as determined by active heads.
///
/// Authorities whose delivery failed are removed again, so they get retried on next refresh.
deliveries: HashMap<AuthorityDiscoveryId, DeliveryStatus>,
/// Whether we have any tasks failed since the last refresh.
has_failed_sends: bool,
/// Sender to be cloned for tasks.
tx: NestingSender<M, TaskFinish>,
}
/// Status of a particular vote/statement delivery to a particular validator.
///
/// There is no variant for failures: failed deliveries are removed from the map of deliveries.
enum DeliveryStatus {
/// Request is still in flight.
Pending,
/// Succeeded - no need to send request to this peer anymore.
Succeeded,
}
/// A sending task finishes with this result:
///
/// Sent by the spawned response waiting task once the request to `receiver` completed.
#[derive(Debug)]
pub struct TaskFinish {
/// The candidate this task was running for.
pub candidate_hash: CandidateHash,
/// The authority the request was sent to.
pub receiver: AuthorityDiscoveryId,
/// The result of the delivery attempt.
pub result: TaskResult,
}
/// Outcome of a single delivery attempt to one authority.
#[derive(Debug)]
pub enum TaskResult {
/// Task succeeded in getting the request to its peer.
Succeeded,
/// Task was not able to get the request out to its peer.
///
/// It should be retried in that case.
Failed(RequestError),
}
impl TaskResult {
pub fn as_metrics_label(&self) -> &'static str {
match self {
Self::Succeeded => SUCCEEDED,
Self::Failed(_) => FAILED,
}
}
}
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
impl<M: 'static + Send + Sync> SendTask<M> {
/// Create a `SendTask` and immediately start sending the dispute message to peers.
///
/// Each new `SendTask` triggers a message to every relevant validator. A per-peer rate limit is
/// therefore achieved by limiting the construction of new `SendTask`s, which is the
/// responsibility of the caller.
pub async fn new<Context>(
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    active_sessions: &HashMap<SessionIndex, Hash>,
    tx: NestingSender<M, TaskFinish>,
    request: DisputeRequest,
    metrics: &Metrics,
) -> Result<Self> {
    let deliveries = HashMap::new();
    let mut task = Self { request, deliveries, has_failed_sends: false, tx };
    // Kick off the initial round of requests.
    task.refresh_sends(ctx, runtime, active_sessions, metrics).await?;
    Ok(task)
}
/// Make sure we are sending to all relevant authorities.
///
/// This function is called at construction and should also be called whenever a session change
/// happens and on a regular basis to ensure we are retrying failed attempts.
///
/// This might resend to validators and is thus subject to any rate limiting we might want.
/// Calls to this function for different instances should be rate limited according to
/// `SEND_RATE_LIMIT`.
///
/// Returns: `True` if this call resulted in new requests.
pub async fn refresh_sends<Context>(
&mut self,
ctx: &mut Context,
runtime: &mut RuntimeInfo,
active_sessions: &HashMap<SessionIndex, Hash>,
metrics: &Metrics,
) -> Result<bool> {
let new_authorities = self.get_relevant_validators(ctx, runtime, active_sessions).await?;
// Note this will also contain all authorities for which sending failed previously:
let add_authorities: Vec<_> = new_authorities
.iter()
.filter(|a| !self.deliveries.contains_key(a))
.map(Clone::clone)
.collect();
// Get rid of dead/irrelevant tasks/statuses:
gum::trace!(
target: LOG_TARGET,
already_running_deliveries = ?self.deliveries.len(),
"Cleaning up deliveries"
);
self.deliveries.retain(|k, _| new_authorities.contains(k));
// Start any new tasks that are needed:
gum::trace!(
target: LOG_TARGET,
new_and_failed_authorities = ?add_authorities.len(),
overall_authority_set_size = ?new_authorities.len(),
already_running_deliveries = ?self.deliveries.len(),
"Starting new send requests for authorities."
);
let new_statuses =
send_requests(ctx, self.tx.clone(), add_authorities, self.request.clone(), metrics)
.await?;
let was_empty = new_statuses.is_empty();
gum::trace!(
target: LOG_TARGET,
sent_requests = ?new_statuses.len(),
"Requests dispatched."
);
self.has_failed_sends = false;
self.deliveries.extend(new_statuses.into_iter());
Ok(!was_empty)
}
/// Whether any sends have failed since the last refresh.
///
/// The flag is set when a failed delivery is reported and reset by `refresh_sends`.
pub fn has_failed_sends(&self) -> bool {
self.has_failed_sends
}
/// Record the outcome of a delivery attempt to `authority`.
///
/// Called by `DisputeSender` upon reception of the corresponding message from our spawned
/// `wait_response_task`.
///
/// - On failure the delivery state is removed and the task is flagged as having failed sends,
///   so the next refresh retries that authority.
/// - On success the delivery is marked as succeeded, if it is still tracked.
pub fn on_finished_send(&mut self, authority: &AuthorityDiscoveryId, result: TaskResult) {
    match result {
        TaskResult::Succeeded => {
            let Some(status) = self.deliveries.get_mut(authority) else {
                // Can happen when a sending became irrelevant while the response was
                // already queued.
                gum::debug!(
                    target: LOG_TARGET,
                    candidate = ?self.request.0.candidate_receipt.hash(),
                    ?authority,
                    ?result,
                    "Received `FromSendingTask::Finished` for non existing task."
                );
                return;
            };
            // We are done here:
            *status = DeliveryStatus::Succeeded;
        },
        TaskResult::Failed(err) => {
            gum::trace!(
                target: LOG_TARGET,
                ?authority,
                candidate_hash = %self.request.0.candidate_receipt.hash(),
                %err,
                "Error sending dispute statements to node."
            );
            self.has_failed_sends = true;
            // Remove state, so we know what to try again:
            self.deliveries.remove(authority);
        },
    }
}
/// Collect all authorities that should receive the dispute request.
///
/// These are the `teyrchain` validators of the session the candidate occurred in, plus all
/// authorities of all currently active sessions, as determined by currently active heads. Our
/// own index, if known, is left out in both cases.
async fn get_relevant_validators<Context>(
    &self,
    ctx: &mut Context,
    runtime: &mut RuntimeInfo,
    active_sessions: &HashMap<SessionIndex, Hash>,
) -> Result<HashSet<AuthorityDiscoveryId>> {
    let ref_head = self.request.0.candidate_receipt.descriptor.relay_parent();
    let dispute_session = self.request.0.session_index;

    // Retrieve all authorities which participated in the teyrchain consensus of the session
    // in which the candidate was backed.
    let info = runtime.get_session_info_by_index(ctx.sender(), ref_head, dispute_session).await?;
    let our_index = &info.validator_info.our_index;
    // Only the first `validators.len()` discovery keys are considered here.
    let validator_count = info.session_info.validators.len();
    let mut authorities: HashSet<_> = info
        .session_info
        .discovery_keys
        .iter()
        .take(validator_count)
        .enumerate()
        .filter_map(|(i, key)| (Some(ValidatorIndex(i as _)) != *our_index).then(|| key.clone()))
        .collect();

    // Retrieve all authorities for the current session as indicated by the active
    // heads we are tracking.
    for (session_index, head) in active_sessions {
        let info =
            runtime.get_session_info_by_index(ctx.sender(), *head, *session_index).await?;
        let our_index = &info.validator_info.our_index;
        let current_authorities =
            info.session_info.discovery_keys.iter().enumerate().filter_map(|(i, key)| {
                (Some(ValidatorIndex(i as _)) != *our_index).then(|| key.clone())
            });
        authorities.extend(current_authorities);
    }
    Ok(authorities)
}
}
/// Start sending of the given message to all given authorities.
///
/// For each receiver a request is created and a task is spawned which waits for the response
/// and reports the result via `tx`. All requests are then handed to the network bridge in a
/// single `SendRequests` message.
///
/// Returns the delivery status (`Pending`) for each receiver, or a fatal error if spawning a
/// task failed. In the latter case no request has been handed to the network bridge.
#[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)]
async fn send_requests<Context, M: 'static + Send + Sync>(
    ctx: &mut Context,
    tx: NestingSender<M, TaskFinish>,
    receivers: Vec<AuthorityDiscoveryId>,
    req: DisputeRequest,
    metrics: &Metrics,
) -> Result<HashMap<AuthorityDiscoveryId, DeliveryStatus>> {
    let mut statuses = HashMap::with_capacity(receivers.len());
    let mut reqs = Vec::with_capacity(receivers.len());
    // The hash is the same for all receivers: compute it once instead of once per receiver.
    let candidate_hash = req.0.candidate_receipt.hash();

    for receiver in receivers {
        let (outgoing, pending_response) =
            OutgoingRequest::new(Recipient::Authority(receiver.clone()), req.clone());

        reqs.push(Requests::DisputeSendingV1(outgoing));

        let fut = wait_response_task(
            pending_response,
            candidate_hash,
            receiver.clone(),
            tx.clone(),
            metrics.time_dispute_request(),
        );

        ctx.spawn("dispute-sender", fut.boxed()).map_err(FatalError::SpawnTask)?;
        statuses.insert(receiver, DeliveryStatus::Pending);
    }

    let msg = NetworkBridgeTxMessage::SendRequests(reqs, IfDisconnected::ImmediateError);
    ctx.send_message(msg).await;
    Ok(statuses)
}
/// Await the response to a single request and report the outcome via `tx`.
///
/// Meant to be spawned as a task. `_timer` is only held until the task is done.
async fn wait_response_task<M: 'static + Send + Sync>(
    pending_response: impl Future<Output = OutgoingResult<DisputeResponse>>,
    candidate_hash: CandidateHash,
    receiver: AuthorityDiscoveryId,
    mut tx: NestingSender<M, TaskFinish>,
    _timer: Option<metrics::prometheus::prometheus::HistogramTimer>,
) {
    let result = match pending_response.await {
        Ok(DisputeResponse::Confirmed) => TaskResult::Succeeded,
        Err(err) => TaskResult::Failed(err),
    };
    let finished = TaskFinish { candidate_hash, receiver, result };

    if let Err(err) = tx.send_message(finished).await {
        gum::debug!(
            target: LOG_TARGET,
            %err,
            "Failed to notify subsystem about dispute sending result."
        );
    }
}
@@ -0,0 +1,230 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Mock data and utility functions for unit tests in this subsystem.
use std::{
collections::{HashMap, HashSet},
sync::{Arc, LazyLock},
time::Instant,
};
use async_trait::async_trait;
use pezkuwi_node_network_protocol::{authority_discovery::AuthorityDiscovery, PeerId};
use sc_keystore::LocalKeystore;
use sp_application_crypto::AppCrypto;
use sp_keyring::Sr25519Keyring;
use sp_keystore::{Keystore, KeystorePtr};
use pezkuwi_node_primitives::{DisputeMessage, SignedDisputeStatement};
use pezkuwi_primitives::{
AuthorityDiscoveryId, CandidateHash, CandidateReceiptV2 as CandidateReceipt, Hash,
SessionIndex, SessionInfo, ValidatorId, ValidatorIndex,
};
use pezkuwi_primitives_test_helpers::dummy_candidate_descriptor_v2;
use crate::LOG_TARGET;
/// Session index used for the mocked first session.
pub const MOCK_SESSION_INDEX: SessionIndex = 1;
/// Session index used for the mocked second session.
pub const MOCK_NEXT_SESSION_INDEX: SessionIndex = 2;
/// Keyrings of the mocked authorities of the first session.
///
/// The position in this array is the `ValidatorIndex` (see the `*_INDEX` constants below).
pub const MOCK_VALIDATORS: [Sr25519Keyring; 6] = [
Sr25519Keyring::Ferdie,
Sr25519Keyring::Alice,
Sr25519Keyring::Bob,
Sr25519Keyring::Charlie,
Sr25519Keyring::Dave,
Sr25519Keyring::Eve,
];
/// Keyrings of the authorities of the mocked second session.
pub const MOCK_AUTHORITIES_NEXT_SESSION: [Sr25519Keyring; 2] =
[Sr25519Keyring::One, Sr25519Keyring::Two];
/// Index of Ferdie in `MOCK_VALIDATORS`.
pub const FERDIE_INDEX: ValidatorIndex = ValidatorIndex(0);
/// Index of Alice in `MOCK_VALIDATORS`.
pub const ALICE_INDEX: ValidatorIndex = ValidatorIndex(1);
/// Index of Bob in `MOCK_VALIDATORS`.
pub const BOB_INDEX: ValidatorIndex = ValidatorIndex(2);
/// Index of Charlie in `MOCK_VALIDATORS`.
pub const CHARLIE_INDEX: ValidatorIndex = ValidatorIndex(3);
/// Shared, lazily created instance of the mocked `AuthorityDiscovery` service.
pub static MOCK_AUTHORITY_DISCOVERY: LazyLock<MockAuthorityDiscovery> =
    LazyLock::new(MockAuthorityDiscovery::new);
// Creating an innocent looking `SessionInfo` is really expensive in a debug build: around
// 700ms on my machine. We therefore cache those keys here.
//
// Contains the discovery keys of both `MOCK_VALIDATORS` and `MOCK_AUTHORITIES_NEXT_SESSION`.
pub static MOCK_VALIDATORS_DISCOVERY_KEYS: LazyLock<HashMap<Sr25519Keyring, AuthorityDiscoveryId>> =
LazyLock::new(|| {
MOCK_VALIDATORS
.iter()
.chain(MOCK_AUTHORITIES_NEXT_SESSION.iter())
.map(|v| (*v, v.public().into()))
.collect()
});
/// Discovery key of Ferdie, taken from `MOCK_VALIDATORS_DISCOVERY_KEYS`.
pub static FERDIE_DISCOVERY_KEY: LazyLock<AuthorityDiscoveryId> =
LazyLock::new(|| MOCK_VALIDATORS_DISCOVERY_KEYS.get(&Sr25519Keyring::Ferdie).unwrap().clone());
/// `SessionInfo` for the first session.
///
/// Only the first four of `MOCK_VALIDATORS` are validators, but all six are listed in
/// `discovery_keys`.
pub static MOCK_SESSION_INFO: LazyLock<SessionInfo> = LazyLock::new(|| SessionInfo {
validators: MOCK_VALIDATORS.iter().take(4).map(|k| k.public().into()).collect(),
discovery_keys: MOCK_VALIDATORS
.iter()
.map(|k| MOCK_VALIDATORS_DISCOVERY_KEYS.get(&k).unwrap().clone())
.collect(),
assignment_keys: vec![],
validator_groups: Default::default(),
n_cores: 0,
zeroth_delay_tranche_width: 0,
relay_vrf_modulo_samples: 0,
n_delay_tranches: 0,
no_show_slots: 0,
needed_approvals: 0,
active_validator_indices: vec![],
dispute_period: 6,
random_seed: [0u8; 32],
});
/// `SessionInfo` for the second session.
///
/// It has no validators at all; its `discovery_keys` are the two authorities of
/// `MOCK_AUTHORITIES_NEXT_SESSION`.
pub static MOCK_NEXT_SESSION_INFO: LazyLock<SessionInfo> = LazyLock::new(|| SessionInfo {
discovery_keys: MOCK_AUTHORITIES_NEXT_SESSION
.iter()
.map(|k| MOCK_VALIDATORS_DISCOVERY_KEYS.get(&k).unwrap().clone())
.collect(),
validators: Default::default(),
assignment_keys: vec![],
validator_groups: Default::default(),
n_cores: 0,
zeroth_delay_tranche_width: 0,
relay_vrf_modulo_samples: 0,
n_delay_tranches: 0,
no_show_slots: 0,
needed_approvals: 0,
active_validator_indices: vec![],
dispute_period: 6,
random_seed: [0u8; 32],
});
/// Build a candidate receipt with a dummy descriptor for the given relay parent and a random
/// commitments hash.
pub fn make_candidate_receipt(relay_parent: Hash) -> CandidateReceipt {
    let descriptor = dummy_candidate_descriptor_v2(relay_parent);
    let commitments_hash = Hash::random();
    CandidateReceipt { descriptor, commitments_hash }
}
/// Create an explicit dispute statement for `candidate_hash`, signed by `validator`.
///
/// `valid` selects whether the statement votes the candidate valid or invalid. The statement is
/// always made for `MOCK_SESSION_INDEX`.
///
/// Panics if the key cannot be inserted into the keystore or signing fails.
pub fn make_explicit_signed(
    validator: Sr25519Keyring,
    candidate_hash: CandidateHash,
    valid: bool,
) -> SignedDisputeStatement {
    // A fresh in-memory keystore holding only the key of the given validator.
    let keystore: KeystorePtr = Arc::new(LocalKeystore::in_memory());
    let seed = validator.to_seed();
    Keystore::sr25519_generate_new(&*keystore, ValidatorId::ID, Some(seed.as_str()))
        .expect("Insert key into keystore");

    let signed = SignedDisputeStatement::sign_explicit(
        &keystore,
        valid,
        candidate_hash,
        MOCK_SESSION_INDEX,
        validator.public().into(),
    );
    signed.expect("Keystore should be fine.").expect("Signing should work.")
}
/// Build a `DisputeMessage` for `candidate` from one valid and one invalid vote.
///
/// Both validators are given as indices into `MOCK_VALIDATORS`; the message is checked against
/// `MOCK_SESSION_INFO`. The time needed for creating each vote is logged on trace level.
///
/// Panics on out of range indices or if the message cannot be constructed.
pub fn make_dispute_message(
    candidate: CandidateReceipt,
    valid_validator: ValidatorIndex,
    invalid_validator: ValidatorIndex,
) -> DisputeMessage {
    let candidate_hash = candidate.hash();
    let valid_signer = MOCK_VALIDATORS[valid_validator.0 as usize];
    let invalid_signer = MOCK_VALIDATORS[invalid_validator.0 as usize];

    let valid_started = Instant::now();
    let valid_vote = make_explicit_signed(valid_signer, candidate_hash, true);
    gum::trace!(
        "Passed time for valid vote: {:#?}",
        Instant::now().saturating_duration_since(valid_started)
    );

    let invalid_started = Instant::now();
    let invalid_vote = make_explicit_signed(invalid_signer, candidate_hash, false);
    gum::trace!(
        "Passed time for invalid vote: {:#?}",
        Instant::now().saturating_duration_since(invalid_started)
    );

    DisputeMessage::from_signed_statements(
        valid_vote,
        valid_validator,
        invalid_vote,
        invalid_validator,
        candidate,
        &MOCK_SESSION_INFO,
    )
    .expect("DisputeMessage construction should work.")
}
/// Dummy `AuthorityDiscovery` service.
///
/// Knows a fixed set of authorities, each with a randomly generated `PeerId`.
#[derive(Debug, Clone)]
pub struct MockAuthorityDiscovery {
/// Peer id of each known authority.
peer_ids: HashMap<Sr25519Keyring, PeerId>,
}
impl MockAuthorityDiscovery {
pub fn new() -> Self {
let mut peer_ids = HashMap::new();
peer_ids.insert(Sr25519Keyring::Alice, PeerId::random());
peer_ids.insert(Sr25519Keyring::Bob, PeerId::random());
peer_ids.insert(Sr25519Keyring::Ferdie, PeerId::random());
peer_ids.insert(Sr25519Keyring::Charlie, PeerId::random());
peer_ids.insert(Sr25519Keyring::Dave, PeerId::random());
peer_ids.insert(Sr25519Keyring::Eve, PeerId::random());
peer_ids.insert(Sr25519Keyring::One, PeerId::random());
peer_ids.insert(Sr25519Keyring::Two, PeerId::random());
Self { peer_ids }
}
pub fn get_peer_id_by_authority(&self, authority: Sr25519Keyring) -> PeerId {
*self.peer_ids.get(&authority).expect("Tester only picks valid authorities")
}
}
#[async_trait]
impl AuthorityDiscovery for MockAuthorityDiscovery {
    /// Not supported by this mock: always panics.
    async fn get_addresses_by_authority_id(
        &mut self,
        _authority: pezkuwi_primitives::AuthorityDiscoveryId,
    ) -> Option<HashSet<sc_network::Multiaddr>> {
        panic!("Not implemented");
    }

    /// Return the discovery key of the authority registered with the given peer id.
    ///
    /// Yields `None` for unknown peer ids.
    async fn get_authority_ids_by_peer_id(
        &mut self,
        peer_id: pezkuwi_node_network_protocol::PeerId,
    ) -> Option<HashSet<pezkuwi_primitives::AuthorityDiscoveryId>> {
        let (authority, _) = self.peer_ids.iter().find(|(_, known)| **known == peer_id)?;
        let discovery_key = MOCK_VALIDATORS_DISCOVERY_KEYS.get(authority).unwrap().clone();
        let result = HashSet::from([discovery_key]);
        gum::trace!(
            target: LOG_TARGET,
            %peer_id,
            ?result,
            "Returning authority ids for peer id"
        );
        Some(result)
    }
}
@@ -0,0 +1,901 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//
//! Subsystem unit tests
use std::{
collections::{BTreeMap, HashSet},
task::Poll,
time::{Duration, Instant},
};
use assert_matches::assert_matches;
use codec::{Decode, Encode};
use futures::{
channel::oneshot,
future::{poll_fn, ready},
pin_mut, Future,
};
use futures_timer::Delay;
use sc_network::{config::RequestResponseConfig, ProtocolName};
use pezkuwi_node_network_protocol::{
request_response::{v1::DisputeRequest, IncomingRequest, ReqProtocolNames},
PeerId,
};
use sp_keyring::Sr25519Keyring;
use pezkuwi_node_network_protocol::{
request_response::{v1::DisputeResponse, Recipient, Requests},
IfDisconnected,
};
use pezkuwi_node_primitives::DisputeStatus;
use pezkuwi_node_subsystem::{
messages::{
AllMessages, DisputeCoordinatorMessage, DisputeDistributionMessage, ImportStatementsResult,
NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest,
},
ActiveLeavesUpdate, FromOrchestra, OverseerSignal,
};
use pezkuwi_node_subsystem_test_helpers::{
mock::{make_ferdie_keystore, new_leaf},
subsystem_test_harness, TestSubsystemContextHandle,
};
use pezkuwi_primitives::{
AuthorityDiscoveryId, Block, CandidateHash, CandidateReceiptV2 as CandidateReceipt,
ExecutorParams, Hash, NodeFeatures, SessionIndex, SessionInfo,
};
use self::mock::{
make_candidate_receipt, make_dispute_message, ALICE_INDEX, FERDIE_DISCOVERY_KEY, FERDIE_INDEX,
MOCK_AUTHORITY_DISCOVERY, MOCK_NEXT_SESSION_INDEX, MOCK_NEXT_SESSION_INFO, MOCK_SESSION_INDEX,
MOCK_SESSION_INFO,
};
use crate::{
receiver::BATCH_COLLECTING_INTERVAL,
tests::mock::{BOB_INDEX, CHARLIE_INDEX},
DisputeDistributionSubsystem, Metrics, LOG_TARGET, SEND_RATE_LIMIT,
};
/// Useful mock providers.
pub mod mock;
/// A single `SendDispute` message results in requests to the expected receivers.
#[test]
fn send_dispute_sends_dispute() {
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>, _req_cfg| async move {
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());
		send_dispute(&mut handle, candidate).await;

		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// Two disputes sent back to back must take at least `SEND_RATE_LIMIT` in total.
#[test]
fn send_honors_rate_limit() {
	sp_tracing::try_init_simple();
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>, _req_cfg| async move {
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let first_candidate = make_candidate_receipt(Hash::random());
		let started = Instant::now();
		send_dispute(&mut handle, first_candidate).await;
		// First send should not be rate limited:
		gum::trace!("Passed time: {:#?}", started.elapsed());
		// This test would likely be flaky on CI:
		//assert!(Instant::now().saturating_duration_since(before_request) < SEND_RATE_LIMIT);

		let second_candidate = make_candidate_receipt(Hash::random());
		send_dispute(&mut handle, second_candidate).await;
		// Second send should be rate limited:
		gum::trace!("Passed time for send_dispute: {:#?}", started.elapsed());
		assert!(started.elapsed() >= SEND_RATE_LIMIT);

		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// Helper for sending a new dispute to dispute-distribution sender and handling resulting messages.
async fn send_dispute(
handle: &mut TestSubsystemContextHandle<DisputeDistributionMessage>,
candidate: CandidateReceipt,
) {
let before_request = Instant::now();
let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX);
gum::trace!(
"Passed time for making message: {:#?}",
Instant::now().saturating_duration_since(before_request)
);
let before_request = Instant::now();
handle
.send(FromOrchestra::Communication {
msg: DisputeDistributionMessage::SendDispute(message.clone()),
})
.await;
gum::trace!(
"Passed time for sending message: {:#?}",
Instant::now().saturating_duration_since(before_request)
);
let expected_receivers = {
let info = &MOCK_SESSION_INFO;
info.discovery_keys
.clone()
.into_iter()
.filter(|a| a != &Sr25519Keyring::Ferdie.public().into())
.collect()
// All validators are also authorities in the first session, so we are
// done here.
};
check_sent_requests(handle, expected_receivers, true).await;
}
// Things to test:
// x Request triggers import
// x Subsequent imports get batched
// x Batch gets flushed.
// x Batch gets renewed.
// x Non authority requests get dropped.
// x Sending rate limit is honored.
// x Receiving rate limit is honored.
// x Duplicate requests on batch are dropped
/// A dispute request coming from a peer that is not a known authority is answered with a
/// reputation change and does not trigger any further messages.
#[test]
fn received_non_authorities_are_dropped() {
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>,
		mut req_cfg: RequestResponseConfig| async move {
		let req_tx = req_cfg.inbound_queue.as_mut().unwrap();
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());
		let message = make_dispute_message(candidate, ALICE_INDEX, FERDIE_INDEX);

		// Non validator request should get dropped:
		let stranger = PeerId::random();
		let rx_response = send_network_dispute_request(req_tx, stranger, message.into()).await;

		let response = assert_matches!(rx_response.await, Ok(resp) => resp);
		// Peer should get punished:
		assert_eq!(response.reputation_changes.len(), 1);

		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// A dispute request from a known authority is forwarded for import and gets confirmed.
#[test]
fn received_request_triggers_import() {
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>,
		mut req_cfg: RequestResponseConfig| async move {
		let req_tx = req_cfg.inbound_queue.as_mut().unwrap();
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());
		let message = make_dispute_message(candidate, ALICE_INDEX, FERDIE_INDEX);
		let alice = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice);

		nested_network_dispute_request(
			&mut handle,
			req_tx,
			alice,
			message.into(),
			ImportStatementsResult::ValidImport,
			true,
			// Nothing to do while the import is pending.
			move |_handle, _req_tx, _message| ready(()),
		)
		.await;

		gum::trace!(target: LOG_TARGET, "Concluding.");
		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// Votes arriving after the initial import are collected in a batch, duplicates are rejected
/// without punishment and do not keep the batch alive, and the batch is flushed as one import.
#[test]
fn batching_works() {
	let test = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>,
		mut req_cfg: RequestResponseConfig| async move {
		let req_tx = req_cfg.inbound_queue.as_mut().unwrap();
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let relay_parent = Hash::random();
		let candidate = make_candidate_receipt(relay_parent);
		let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX);

		// Initial request should get forwarded immediately:
		nested_network_dispute_request(
			&mut handle,
			req_tx,
			MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice),
			message.clone().into(),
			ImportStatementsResult::ValidImport,
			true,
			move |_handle, _req_tx, _message| ready(()),
		)
		.await;

		let mut rx_responses = Vec::new();

		let message = make_dispute_message(candidate.clone(), BOB_INDEX, FERDIE_INDEX);
		let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob);
		rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await);

		let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, FERDIE_INDEX);
		let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Charlie);
		rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await);
		gum::trace!("Imported 3 votes into batch");

		// Futures are lazy: the delay must be awaited, otherwise no time passes here and the
		// "batch is still alive after one interval" phase is never exercised.
		Delay::new(BATCH_COLLECTING_INTERVAL).await;
		gum::trace!("Batch should still be alive");
		// Batch should still be alive (2 new votes):
		// Let's import two more votes, but fully duplicates - should not extend batch live.
		gum::trace!("Importing duplicate votes");
		let mut rx_responses_duplicate = Vec::new();

		let message = make_dispute_message(candidate.clone(), BOB_INDEX, FERDIE_INDEX);
		let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob);
		rx_responses_duplicate
			.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await);

		let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, FERDIE_INDEX);
		let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Charlie);
		rx_responses_duplicate
			.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await);

		// Duplicates are answered with an error right away.
		for rx_response in rx_responses_duplicate {
			assert_matches!(
				rx_response.await,
				Ok(resp) => {
					let sc_network::config::OutgoingResponse {
						result,
						reputation_changes,
						sent_feedback: _,
					} = resp;
					gum::trace!(
						target: LOG_TARGET,
						?reputation_changes,
						"Received reputation changes."
					);
					// We don't punish on that.
					assert_eq!(reputation_changes.len(), 0);

					assert_matches!(result, Err(()));
				}
			);
		}

		Delay::new(BATCH_COLLECTING_INTERVAL).await;
		gum::trace!("Batch should be ready now (only duplicates have been added)");

		let pending_confirmation = assert_matches!(
			handle.recv().await,
			AllMessages::DisputeCoordinator(
				DisputeCoordinatorMessage::ImportStatements {
					candidate_receipt: _,
					session,
					statements,
					pending_confirmation: Some(pending_confirmation),
				}
			) => {
				assert_eq!(session, MOCK_SESSION_INDEX);
				assert_eq!(statements.len(), 3);
				pending_confirmation
			}
		);
		pending_confirmation.send(ImportStatementsResult::ValidImport).unwrap();

		// The batched (non-duplicate) requests are only confirmed once the batch got imported.
		for rx_response in rx_responses {
			assert_matches!(
				rx_response.await,
				Ok(resp) => {
					let sc_network::config::OutgoingResponse {
						result,
						reputation_changes: _,
						sent_feedback,
					} = resp;

					let result = result.unwrap();
					let decoded =
						<DisputeResponse as Decode>::decode(&mut result.as_slice()).unwrap();

					assert!(decoded == DisputeResponse::Confirmed);
					if let Some(sent_feedback) = sent_feedback {
						sent_feedback.send(()).unwrap();
					}
					gum::trace!(
						target: LOG_TARGET,
						"Valid import happened."
					);
				}
			);
		}

		gum::trace!(target: LOG_TARGET, "Concluding.");
		conclude(&mut handle).await;
	};
	test_harness(test);
}
/// A peer that sends more requests than allowed gets an error for the excess request, while
/// its earlier requests are still batched, imported and confirmed.
#[test]
fn receive_rate_limit_is_enforced() {
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>,
		mut req_cfg: RequestResponseConfig| async move {
		let req_tx = req_cfg.inbound_queue.as_mut().unwrap();
		let _ = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());

		// Initial request should get forwarded immediately:
		let initial = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX);
		nested_network_dispute_request(
			&mut handle,
			req_tx,
			MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice),
			initial.into(),
			ImportStatementsResult::ValidImport,
			true,
			move |_handle, _req_tx, _message| ready(()),
		)
		.await;

		// All following requests come from the very same peer.
		let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob);
		let mut rx_responses = Vec::new();
		for voter in [BOB_INDEX, CHARLIE_INDEX] {
			let message = make_dispute_message(candidate.clone(), voter, FERDIE_INDEX);
			rx_responses.push(send_network_dispute_request(req_tx, peer, message.into()).await);
		}

		gum::trace!("Import one too much:");
		let excess = make_dispute_message(candidate.clone(), CHARLIE_INDEX, ALICE_INDEX);
		let rx_response_flood = send_network_dispute_request(req_tx, peer, excess.into()).await;
		let flood_response = assert_matches!(rx_response_flood.await, Ok(resp) => resp);
		// Received error because of flood.
		assert!(flood_response.result.is_err());

		gum::trace!("Need to wait 2 patch intervals:");
		for _ in 0..2 {
			Delay::new(BATCH_COLLECTING_INTERVAL).await;
		}
		gum::trace!("Batch should be ready now");

		let pending_confirmation = assert_matches!(
			handle.recv().await,
			AllMessages::DisputeCoordinator(
				DisputeCoordinatorMessage::ImportStatements {
					candidate_receipt: _,
					session,
					statements,
					pending_confirmation: Some(pending_confirmation),
				}
			) => {
				assert_eq!(session, MOCK_SESSION_INDEX);
				// Only 3 as fourth was flood:
				assert_eq!(statements.len(), 3);
				pending_confirmation
			}
		);
		pending_confirmation.send(ImportStatementsResult::ValidImport).unwrap();

		for rx_response in rx_responses {
			let sc_network::config::OutgoingResponse { result, sent_feedback, .. } =
				assert_matches!(rx_response.await, Ok(resp) => resp);

			let payload = result.unwrap();
			let decoded = <DisputeResponse as Decode>::decode(&mut payload.as_slice()).unwrap();
			assert!(decoded == DisputeResponse::Confirmed);

			if let Some(sent_feedback) = sent_feedback {
				sent_feedback.send(()).unwrap();
			}
			gum::trace!(
				target: LOG_TARGET,
				"Valid import happened."
			);
		}

		gum::trace!(target: LOG_TARGET, "Concluding.");
		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// An unconfirmed dispute is not retried once the coordinator no longer reports it as active.
#[test]
fn send_dispute_gets_cleaned_up() {
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>, _| async move {
		let old_head = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());
		let message = make_dispute_message(candidate, ALICE_INDEX, FERDIE_INDEX);
		let msg = DisputeDistributionMessage::SendDispute(message);
		handle.send(FromOrchestra::Communication { msg }).await;

		// All validators are also authorities in the first session, so everyone but
		// Ferdie is expected to receive the dispute.
		let ferdie: AuthorityDiscoveryId = Sr25519Keyring::Ferdie.public().into();
		let expected_receivers: HashSet<AuthorityDiscoveryId> = MOCK_SESSION_INFO
			.discovery_keys
			.iter()
			.filter(|key| **key != ferdie)
			.cloned()
			.collect();
		check_sent_requests(&mut handle, expected_receivers, false).await;

		// Give tasks a chance to finish:
		Delay::new(Duration::from_millis(20)).await;

		// No disputes any more:
		let no_disputes = BTreeMap::new();
		activate_leaf(
			&mut handle,
			Hash::random(),
			Some(old_head),
			MOCK_SESSION_INDEX,
			None,
			no_disputes,
		)
		.await;

		// Yield, so subsystem can make progress:
		Delay::new(Duration::from_millis(2)).await;

		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// An unconfirmed dispute is re-sent on new leaves and, after a session change, is sent to the
/// authorities of the new session as well as the validators of the dispute's session.
#[test]
fn dispute_retries_and_works_across_session_boundaries() {
	sp_tracing::try_init_simple();
	let scenario = |mut handle: TestSubsystemContextHandle<DisputeDistributionMessage>, _| async move {
		let old_head = handle_subsystem_startup(&mut handle, None).await;

		let candidate = make_candidate_receipt(Hash::random());
		let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX);
		let msg = DisputeDistributionMessage::SendDispute(message);
		handle.send(FromOrchestra::Communication { msg }).await;

		// All validators are also authorities in the first session, so everyone but
		// Ferdie is expected to receive the dispute.
		let ferdie: AuthorityDiscoveryId = Sr25519Keyring::Ferdie.public().into();
		let expected_receivers: HashSet<AuthorityDiscoveryId> = MOCK_SESSION_INFO
			.discovery_keys
			.iter()
			.filter(|key| **key != ferdie)
			.cloned()
			.collect();

		// Requests don't get confirmed - dispute is carried over to next session.
		check_sent_requests(&mut handle, expected_receivers.clone(), false).await;

		// Give tasks a chance to finish:
		Delay::new(Duration::from_millis(20)).await;

		// The dispute stays active for the rest of the scenario.
		let still_active = || {
			BTreeMap::from([((MOCK_SESSION_INDEX, candidate.hash()), DisputeStatus::Active)])
		};

		// Trigger retry:
		let old_head2 = Hash::random();
		activate_leaf(
			&mut handle,
			old_head2,
			Some(old_head),
			MOCK_SESSION_INDEX,
			None,
			still_active(),
		)
		.await;
		check_sent_requests(&mut handle, expected_receivers.clone(), false).await;

		// Give tasks a chance to finish:
		Delay::new(Duration::from_millis(20)).await;

		// Session change:
		activate_leaf(
			&mut handle,
			Hash::random(),
			Some(old_head2),
			MOCK_NEXT_SESSION_INDEX,
			Some(MOCK_NEXT_SESSION_INFO.clone()),
			still_active(),
		)
		.await;

		// Receivers are now the next session's discovery keys plus the validators of the
		// old session (always without Ferdie).
		let validator_count = MOCK_SESSION_INFO.validators.len();
		let old_validators = MOCK_SESSION_INFO
			.discovery_keys
			.iter()
			.take(validator_count)
			.filter(|key| **key != *FERDIE_DISCOVERY_KEY);
		let expected_receivers: HashSet<AuthorityDiscoveryId> = MOCK_NEXT_SESSION_INFO
			.discovery_keys
			.iter()
			.filter(|key| **key != *FERDIE_DISCOVERY_KEY)
			.chain(old_validators)
			.cloned()
			.collect();
		check_sent_requests(&mut handle, expected_receivers, true).await;

		conclude(&mut handle).await;
	};
	test_harness(scenario);
}
/// Push an encoded dispute request from `peer` into the subsystem's inbound request queue.
///
/// Returns the receiving end on which the subsystem's response to that request will arrive.
///
/// # Panics
///
/// Panics if the request cannot be sent on `req_tx`.
async fn send_network_dispute_request(
	req_tx: &mut async_channel::Sender<sc_network::config::IncomingRequest>,
	peer: PeerId,
	message: DisputeRequest,
) -> oneshot::Receiver<sc_network::config::OutgoingResponse> {
	let (response_tx, response_rx) = oneshot::channel();
	let incoming = sc_network::config::IncomingRequest {
		peer,
		payload: message.encode(),
		pending_response: response_tx,
	};
	req_tx.send(incoming).await.unwrap();
	response_rx
}
/// Send request and handle its reactions.
///
/// Passed in function will be called while votes are still being imported.
async fn nested_network_dispute_request<'a, F, O>(
handle: &'a mut TestSubsystemContextHandle<DisputeDistributionMessage>,
req_tx: &'a mut async_channel::Sender<sc_network::config::IncomingRequest>,
peer: PeerId,
message: DisputeRequest,
import_result: ImportStatementsResult,
need_session_info: bool,
inner: F,
) where
F: FnOnce(
&'a mut TestSubsystemContextHandle<DisputeDistributionMessage>,
&'a mut async_channel::Sender<sc_network::config::IncomingRequest>,
DisputeRequest,
) -> O
+ 'a,
O: Future<Output = ()> + 'a,
{
let rx_response = send_network_dispute_request(req_tx, peer, message.clone().into()).await;
if need_session_info {
// Subsystem might need `SessionInfo` for determining indices:
match handle.recv().await {
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionInfo(_, tx),
)) => {
tx.send(Ok(Some(MOCK_SESSION_INFO.clone())))
.expect("Receiver should stay alive.");
},
unexpected => panic!("Unexpected message {:?}", unexpected),
}
match handle.recv().await {
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionExecutorParams(_, tx),
)) => {
tx.send(Ok(Some(ExecutorParams::default())))
.expect("Receiver should stay alive.");
},
unexpected => panic!("Unexpected message {:?}", unexpected),
}
match handle.recv().await {
AllMessages::RuntimeApi(RuntimeApiMessage::Request(
_,
RuntimeApiRequest::NodeFeatures(_, si_tx),
)) => {
si_tx.send(Ok(NodeFeatures::EMPTY)).unwrap();
},
unexpected => panic!("Unexpected message {:?}", unexpected),
}
}
// Import should get initiated:
let pending_confirmation = assert_matches!(
handle.recv().await,
AllMessages::DisputeCoordinator(
DisputeCoordinatorMessage::ImportStatements {
candidate_receipt,
session,
statements,
pending_confirmation: Some(pending_confirmation),
}
) => {
let candidate_hash = candidate_receipt.hash();
assert_eq!(session, MOCK_SESSION_INDEX);
assert_eq!(candidate_hash, message.0.candidate_receipt.hash());
assert_eq!(statements.len(), 2);
pending_confirmation
}
);
// Do the inner thing:
inner(handle, req_tx, message).await;
// Confirm import
pending_confirmation.send(import_result).unwrap();
assert_matches!(
rx_response.await,
Ok(resp) => {
let sc_network::config::OutgoingResponse {
result,
reputation_changes,
sent_feedback,
} = resp;
match import_result {
ImportStatementsResult::ValidImport => {
let result = result.unwrap();
let decoded =
<DisputeResponse as Decode>::decode(&mut result.as_slice()).unwrap();
assert!(decoded == DisputeResponse::Confirmed);
if let Some(sent_feedback) = sent_feedback {
sent_feedback.send(()).unwrap();
}
gum::trace!(
target: LOG_TARGET,
"Valid import happened."
);
}
ImportStatementsResult::InvalidImport => {
// Peer should get punished:
assert_eq!(reputation_changes.len(), 1);
}
}
}
);
}
/// Assert that the subsystem has no outgoing message pending, then signal `Conclude`.
///
/// # Panics
///
/// Panics if a message is immediately available on `handle`.
async fn conclude(handle: &mut TestSubsystemContextHandle<DisputeDistributionMessage>) {
	// No more messages should be in the queue: poll `recv` exactly once and require it to be
	// pending.
	let queue_is_idle = poll_fn(|cx| {
		let next_message = handle.recv();
		pin_mut!(next_message);
		// No requests should be initiated, as there is no longer any dispute active:
		assert_matches!(next_message.poll(cx), Poll::Pending, "No requests expected");
		Poll::Ready(())
	});
	queue_is_idle.await;

	let conclude_signal = FromOrchestra::Signal(OverseerSignal::Conclude);
	handle.send(conclude_signal).await;
}
/// Activate a new leaf and answer the queries the subsystem issues in response.
///
/// The subsystem is expected to ask for the session index of the new leaf and, last, for the
/// active disputes. Pass a `new_session` if you expect the subsystem to retrieve `SessionInfo`
/// when given the `session_index`; in that case the follow-up requests for executor params and
/// node features are answered as well.
async fn activate_leaf(
	handle: &mut TestSubsystemContextHandle<DisputeDistributionMessage>,
	activate: Hash,
	deactivate: Option<Hash>,
	session_index: SessionIndex,
	// New session if we expect the subsystem to request it.
	new_session: Option<SessionInfo>,
	// Currently active disputes to send to the subsystem.
	active_disputes: BTreeMap<(SessionIndex, CandidateHash), DisputeStatus>,
) {
	let update = ActiveLeavesUpdate {
		activated: Some(new_leaf(activate, 10)),
		deactivated: deactivate.into_iter().collect(),
	};
	handle.send(FromOrchestra::Signal(OverseerSignal::ActiveLeaves(update))).await;

	assert_matches!(
		handle.recv().await,
		AllMessages::RuntimeApi(RuntimeApiMessage::Request(
			h,
			RuntimeApiRequest::SessionIndexForChild(tx)
		)) => {
			assert_eq!(h, activate);
			tx.send(Ok(session_index)).expect("Receiver should stay alive.");
		}
	);

	// Only a session unknown to the subsystem triggers the next three runtime queries.
	if let Some(session_info) = new_session {
		assert_matches!(
			handle.recv().await,
			AllMessages::RuntimeApi(RuntimeApiMessage::Request(
				h,
				RuntimeApiRequest::SessionInfo(session_idx, tx)
			)) => {
				assert_eq!(h, activate);
				assert_eq!(session_index, session_idx);
				tx.send(Ok(Some(session_info))).expect("Receiver should stay alive.");
			}
		);

		assert_matches!(
			handle.recv().await,
			AllMessages::RuntimeApi(RuntimeApiMessage::Request(
				h,
				RuntimeApiRequest::SessionExecutorParams(session_idx, tx)
			)) => {
				assert_eq!(h, activate);
				assert_eq!(session_index, session_idx);
				let executor_params = ExecutorParams::default();
				tx.send(Ok(Some(executor_params))).expect("Receiver should stay alive.");
			}
		);

		assert_matches!(
			handle.recv().await,
			AllMessages::RuntimeApi(
				RuntimeApiMessage::Request(_, RuntimeApiRequest::NodeFeatures(_, si_tx), )
			) => {
				si_tx.send(Ok(NodeFeatures::EMPTY)).unwrap();
			}
		);
	}

	assert_matches!(
		handle.recv().await,
		AllMessages::DisputeCoordinator(DisputeCoordinatorMessage::ActiveDisputes(tx)) => {
			tx.send(active_disputes).expect("Receiver should stay alive.");
		}
	);
}
/// Check whether sent network bridge requests match the expectation.
///
/// Expects the next message from the subsystem to be a `SendRequests` with
/// `IfDisconnected::ImmediateError` that contains only `DisputeSendingV1` requests, addressed
/// to exactly `expected_receivers` with no receiver occurring twice.
///
/// If `confirm_receive` is set, every request is answered with `DisputeResponse::Confirmed`;
/// otherwise the requests are dropped unanswered.
async fn check_sent_requests(
	handle: &mut TestSubsystemContextHandle<DisputeDistributionMessage>,
	expected_receivers: HashSet<AuthorityDiscoveryId>,
	confirm_receive: bool,
) {
	let expected_receivers: HashSet<_> =
		expected_receivers.into_iter().map(Recipient::Authority).collect();

	// Sends to concerned validators:
	assert_matches!(
		handle.recv().await,
		AllMessages::NetworkBridgeTx(
			NetworkBridgeTxMessage::SendRequests(reqs, IfDisconnected::ImmediateError)
		) => {
			let reqs: Vec<_> = reqs.into_iter().map(|r|
				assert_matches!(
					r,
					Requests::DisputeSendingV1(req) => {req}
				)
			)
			.collect();

			// Keep the raw list so duplicates can be detected by comparing lengths with the
			// deduplicated set.
			let receivers_raw: Vec<_> = reqs.iter().map(|r| r.peer.clone()).collect();
			let receivers: HashSet<_> = receivers_raw.iter().cloned().collect();
			assert_eq!(receivers_raw.len(), receivers.len(), "No duplicates are expected.");
			assert_eq!(receivers.len(), expected_receivers.len());
			assert_eq!(receivers, expected_receivers);

			if confirm_receive {
				for req in reqs {
					req.pending_response.send(
						Ok((DisputeResponse::Confirmed.encode(), ProtocolName::from("")))
					)
					.expect("Subsystem should be listening for a response.");
				}
			}
		}
	);
}
/// Initialize subsystem and return request sender needed for sending incoming requests to the
/// subsystem.
async fn handle_subsystem_startup(
handle: &mut TestSubsystemContextHandle<DisputeDistributionMessage>,
ongoing_dispute: Option<CandidateHash>,
) -> Hash {
let relay_parent = Hash::random();
activate_leaf(
handle,
relay_parent,
None,
MOCK_SESSION_INDEX,
Some(MOCK_SESSION_INFO.clone()),
ongoing_dispute
.into_iter()
.map(|c| ((MOCK_SESSION_INDEX, c), DisputeStatus::Active))
.collect(),
)
.await;
relay_parent
}
/// Run the dispute distribution subsystem together with the given test function.
///
/// The test function plays the role of the overseer: it receives the handle for talking to the
/// subsystem and the request-response config whose inbound queue feeds incoming requests to it.
fn test_harness<TestFn, Fut>(test: TestFn)
where
	TestFn: FnOnce(
		TestSubsystemContextHandle<DisputeDistributionMessage>,
		RequestResponseConfig,
	) -> Fut,
	Fut: Future<Output = ()>,
{
	sp_tracing::try_init_simple();

	let genesis_hash = Hash::repeat_byte(0xff);
	let req_protocol_names = ReqProtocolNames::new(&genesis_hash, None);
	let (req_receiver, req_cfg) = IncomingRequest::get_config_receiver::<
		Block,
		sc_network::NetworkWorker<Block, Hash>,
	>(&req_protocol_names);

	let dispute_distribution = DisputeDistributionSubsystem::new(
		make_ferdie_keystore(),
		req_receiver,
		MOCK_AUTHORITY_DISCOVERY.clone(),
		Metrics::new_dummy(),
	);

	// A fatal error only gets logged, it does not fail the test by itself.
	let subsystem = |ctx| async {
		if let Err(fatal) = dispute_distribution.run(ctx).await {
			gum::debug!(
				target: LOG_TARGET,
				?fatal,
				"Dispute distribution exited with fatal error."
			);
		}
	};

	subsystem_test_harness(|handle| test(handle, req_cfg), subsystem);
}
@@ -0,0 +1,57 @@
[package]
name = "pezkuwi-gossip-support"
version = "7.0.0"
description = "Pezkuwi Gossip Support subsystem. Responsible for keeping track of session changes and issuing a connection request to the relevant validators on every new session."
authors.workspace = true
edition.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
sc-network = { workspace = true, default-features = true }
sp-application-crypto = { workspace = true, default-features = true }
sp-core = { workspace = true, default-features = true }
sp-crypto-hashing = { workspace = true, default-features = true }
sp-keystore = { workspace = true, default-features = true }
pezkuwi-node-network-protocol = { workspace = true, default-features = true }
pezkuwi-node-subsystem = { workspace = true, default-features = true }
pezkuwi-node-subsystem-util = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
futures = { workspace = true }
futures-timer = { workspace = true }
gum = { workspace = true, default-features = true }
rand = { workspace = true }
rand_chacha = { workspace = true }
[dev-dependencies]
sp-authority-discovery = { workspace = true, default-features = true }
sp-consensus-babe = { workspace = true, default-features = true }
sp-keyring = { workspace = true, default-features = true }
sp-tracing = { workspace = true, default-features = true }
pezkuwi-node-subsystem-test-helpers = { workspace = true }
assert_matches = { workspace = true }
async-trait = { workspace = true }
parking_lot = { workspace = true, default-features = true }
quickcheck = { workspace = true, default-features = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-network-protocol/runtime-benchmarks",
"pezkuwi-node-subsystem-test-helpers/runtime-benchmarks",
"pezkuwi-node-subsystem-util/runtime-benchmarks",
"pezkuwi-node-subsystem/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-authority-discovery/runtime-benchmarks",
"sp-consensus-babe/runtime-benchmarks",
"sp-keyring/runtime-benchmarks",
]
@@ -0,0 +1,891 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! This subsystem is responsible for keeping track of session changes
//! and issuing a connection request to the relevant validators
//! on every new session.
//!
//! In addition to that, it creates a gossip overlay topology
//! which limits the number of messages sent and received
//! to be on the order of the square root of the number of validators. Our neighbors
//! in this graph will be forwarded to the network bridge with
//! the `NetworkBridgeRxMessage::NewGossipTopology` message.
use std::{
collections::{HashMap, HashSet},
fmt,
time::{Duration, Instant},
u32,
};
use futures::{channel::oneshot, select, FutureExt as _};
use futures_timer::Delay;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha20Rng;
use sc_network::{config::parse_addr, Multiaddr};
use sp_application_crypto::{AppCrypto, ByteArray};
use sp_keystore::{Keystore, KeystorePtr};
use pezkuwi_node_network_protocol::{
authority_discovery::AuthorityDiscovery, peer_set::PeerSet, GossipSupportNetworkMessage,
PeerId, ValidationProtocols,
};
use pezkuwi_node_subsystem::{
messages::{
ChainApiMessage, GossipSupportMessage, NetworkBridgeEvent, NetworkBridgeRxMessage,
NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest,
},
overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem, SubsystemError,
};
use pezkuwi_node_subsystem_util as util;
use pezkuwi_primitives::{AuthorityDiscoveryId, Hash, SessionIndex, SessionInfo, ValidatorIndex};
#[cfg(test)]
mod tests;
mod metrics;
use metrics::Metrics;
// Log target used by this subsystem.
const LOG_TARGET: &str = "teyrchain::gossip-support";
// How much time we should wait before reissuing a connection request
// after the last authority discovery resolution failure.
// (A shorter duration is used when compiled for tests.)
#[cfg(not(test))]
const BACKOFF_DURATION: Duration = Duration::from_secs(5);
#[cfg(test)]
const BACKOFF_DURATION: Duration = Duration::from_millis(500);
// The authority discovery queries run every ten minutes,
// so it makes sense to run a bit more often than that to
// detect changes as often as we can, but not too often since
// it won't help.
// (A shorter duration is used when compiled for tests.)
#[cfg(not(test))]
const TRY_RERESOLVE_AUTHORITIES: Duration = Duration::from_secs(5 * 60);
#[cfg(test)]
const TRY_RERESOLVE_AUTHORITIES: Duration = Duration::from_secs(2);
/// Duration after which we consider low connectivity a problem.
///
/// Especially at startup low connectivity is expected (authority discovery cache needs to be
/// populated). Authority discovery on Kusama takes around 8 minutes, so warning after 10 minutes
/// should be fine:
///
/// https://github.com/paritytech/substrate/blob/fc49802f263529160635471c8a17888846035f5d/client/authority-discovery/src/lib.rs#L88
const LOW_CONNECTIVITY_WARN_DELAY: Duration = Duration::from_secs(600);
/// If connectivity is lower than this in percent, issue warning in logs.
const LOW_CONNECTIVITY_WARN_THRESHOLD: usize = 85;
/// The Gossip Support subsystem.
///
/// Generic over `AD`, the authority discovery service stored in `authority_discovery`.
pub struct GossipSupport<AD> {
/// Keystore handed in on construction.
keystore: KeystorePtr,
// NOTE(review): presumably the index of the session handled most recently (`None` until
// the first one is seen) - the code maintaining it is not part of this excerpt; confirm.
last_session_index: Option<SessionIndex>,
/// Whether we are currently an authority or not.
is_authority_now: bool,
/// The minimum known session we build the topology for.
min_known_session: SessionIndex,
// Some(timestamp) if we failed to resolve
// at least a third of authorities the last time.
// `None` otherwise.
last_failure: Option<Instant>,
// Validators can restart during a session, so if they change
// their PeerID, we will connect to them in the best case after
// a session, so we need to try more often to resolve peers and
// reconnect to them. The authority discovery queries run every ten
// minutes, so we can't detect changes in the address more often
// than that.
last_connection_request: Option<Instant>,
/// First time we did not reach our connectivity threshold.
///
/// This is the time of the first failed attempt to connect to >2/3 of all validators in a
/// potential sequence of failed attempts. It will be cleared once we reached >2/3
/// connectivity.
failure_start: Option<Instant>,
/// Successfully resolved connections
///
/// waiting for actual connection.
resolved_authorities: HashMap<AuthorityDiscoveryId, HashSet<Multiaddr>>,
/// Actually connected authorities.
connected_authorities: HashMap<AuthorityDiscoveryId, PeerId>,
/// By `PeerId`.
///
/// Needed for efficient handling of disconnect events.
connected_peers: HashMap<PeerId, HashSet<AuthorityDiscoveryId>>,
/// Authority discovery service.
authority_discovery: AD,
/// The oldest session we need to build a topology for because
/// the finalized blocks are from a session we haven't built a topology for.
finalized_needed_session: Option<u32>,
/// Subsystem metrics.
metrics: Metrics,
}
#[overseer::contextbounds(GossipSupport, prefix = self::overseer)]
impl<AD> GossipSupport<AD>
where
AD: AuthorityDiscovery,
{
/// Create a new instance of the [`GossipSupport`] subsystem.
///
/// Every piece of tracking state starts out empty and both metric gauges are reset, i.e. the
/// node reports itself as neither an authority nor a teyrchain validator until a session
/// proves otherwise.
pub fn new(keystore: KeystorePtr, authority_discovery: AD, metrics: Metrics) -> Self {
    // Initialize metrics to `0`.
    metrics.on_is_not_authority();
    metrics.on_is_not_teyrchain_validator();

    Self {
        keystore,
        authority_discovery,
        metrics,
        // Session tracking.
        last_session_index: None,
        is_authority_now: false,
        // `u32::MAX` so that the first observed session always becomes the minimum.
        min_known_session: u32::MAX,
        finalized_needed_session: None,
        // Connection request bookkeeping.
        last_failure: None,
        last_connection_request: None,
        failure_start: None,
        // Resolved / connected peers.
        resolved_authorities: HashMap::new(),
        connected_authorities: HashMap::new(),
        connected_peers: HashMap::new(),
    }
}
/// Main loop of the subsystem.
///
/// Alternates between a periodic connectivity check (every `LOW_CONNECTIVITY_WARN_DELAY`)
/// and handling messages/signals from the overseer. Returns `self` once a `Conclude`
/// signal arrives or receiving from the overseer fails.
async fn run<Context>(mut self, mut ctx: Context) -> Self {
// Builds a fresh timer for the next connectivity check.
fn get_connectivity_check_delay() -> Delay {
Delay::new(LOW_CONNECTIVITY_WARN_DELAY)
}
let mut next_connectivity_check = get_connectivity_check_delay().fuse();
loop {
let message = select!(
// Timer fired: report connectivity, re-arm the timer and wait again.
_ = next_connectivity_check => {
self.check_connectivity();
next_connectivity_check = get_connectivity_check_delay().fuse();
continue
}
result = ctx.recv().fuse() =>
match result {
Ok(message) => message,
Err(e) => {
gum::debug!(
target: LOG_TARGET,
err = ?e,
"Failed to receive a message from Overseer, exiting",
);
return self
},
}
);
match message {
// Peer connect/disconnect bookkeeping.
FromOrchestra::Communication {
msg: GossipSupportMessage::NetworkBridgeUpdate(ev),
} => self.handle_connect_disconnect(ev),
FromOrchestra::Signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate {
activated,
..
})) => {
gum::trace!(target: LOG_TARGET, "active leaves signal");
let leaves = activated.into_iter().map(|a| a.hash);
// Errors are only logged; the subsystem keeps running.
if let Err(e) = self.handle_active_leaves(ctx.sender(), leaves).await {
gum::debug!(target: LOG_TARGET, error = ?e);
}
},
// Nothing to do on finality until a session has been observed.
FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, _number)) =>
if let Some(session_index) = self.last_session_index {
if let Err(e) = self
.build_topology_for_last_finalized_if_needed(
ctx.sender(),
session_index,
)
.await
{
gum::warn!(
target: LOG_TARGET,
"Failed to build topology for last finalized session: {:?}",
e
);
}
},
FromOrchestra::Signal(OverseerSignal::Conclude) => return self,
}
}
}
/// Process newly activated leaves.
///
/// For every leaf:
/// 1. Determine if the current session index has changed.
/// 2. If it has (or if the failure back-off / re-resolve interval has elapsed), determine
///    relevant validators and issue a connection request.
/// 3. On a new session, additionally build the gossip topology for it.
///
/// # Errors
///
/// Returns the first error produced by a runtime request or by the authority check; leaves
/// after the failing one are not processed. A missing `SessionInfo` is not an error: it is
/// logged and the leaf is skipped.
async fn handle_active_leaves(
    &mut self,
    sender: &mut impl overseer::GossipSupportSenderTrait,
    leaves: impl Iterator<Item = Hash>,
) -> Result<(), util::Error> {
    for leaf in leaves {
        let current_index = util::request_session_index_for_child(leaf, sender).await.await??;
        // With no recorded failure / request the elapsed time defaults to zero, so neither
        // flag below triggers.
        let since_failure = self.last_failure.map(|i| i.elapsed()).unwrap_or_default();
        let since_last_reconnect =
            self.last_connection_request.map(|i| i.elapsed()).unwrap_or_default();

        let force_request = since_failure >= BACKOFF_DURATION;
        let re_resolve_authorities = since_last_reconnect >= TRY_RERESOLVE_AUTHORITIES;
        let leaf_session = Some((current_index, leaf));
        let maybe_new_session = match self.last_session_index {
            Some(i) if current_index <= i => None,
            _ => leaf_session,
        };

        let maybe_issue_connection = if force_request || re_resolve_authorities {
            leaf_session
        } else {
            maybe_new_session
        };

        if let Some((session_index, relay_parent)) = maybe_issue_connection {
            let session_info =
                util::request_session_info(leaf, session_index, sender).await.await??;

            let session_info = match session_info {
                Some(s) => s,
                None => {
                    // Log under the subsystem target and report the session that was
                    // actually requested, not the last one we knew about.
                    gum::warn!(
                        target: LOG_TARGET,
                        relay_parent = ?leaf,
                        %session_index,
                        "Failed to get session info.",
                    );

                    continue;
                },
            };

            // Note: we only update `last_session_index` once we've
            // successfully gotten the `SessionInfo`.
            let is_new_session = maybe_new_session.is_some();
            if is_new_session {
                gum::debug!(
                    target: LOG_TARGET,
                    %session_index,
                    "New session detected",
                );
                self.last_session_index = Some(session_index);
                self.is_authority_now =
                    ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys)
                        .is_ok();
            }

            // Connect to authorities from the past/present/future.
            //
            // This is maybe not the right place for this logic to live,
            // but at the moment we're limited by the network bridge's ability
            // to handle connection requests (it only allows one, globally).
            //
            // Certain network protocols - mostly req/res, but some gossip,
            // will require being connected to past/future validators as well
            // as current. That is, the old authority sets are not made obsolete
            // by virtue of a new session being entered. Therefore we maintain
            // connections to a much broader set of validators.
            {
                let mut connections = authorities_past_present_future(sender, leaf).await?;
                self.last_connection_request = Some(Instant::now());
                // Remove all of our locally controlled validator indices so we don't connect to
                // ourself.
                let connections =
                    if remove_all_controlled(&self.keystore, &mut connections) != 0 {
                        connections
                    } else {
                        // If we control none of them, issue an empty connection request
                        // to clean up all connections.
                        Vec::new()
                    };

                if force_request || is_new_session {
                    self.issue_connection_request(sender, connections).await;
                } else if re_resolve_authorities {
                    self.issue_connection_request_to_changed(sender, connections).await;
                }
            }

            if is_new_session {
                if let Err(err) = self
                    .build_topology_for_last_finalized_if_needed(sender, session_index)
                    .await
                {
                    gum::warn!(
                        target: LOG_TARGET,
                        "Failed to build topology for last finalized session: {:?}",
                        err
                    );
                }

                // Gossip topology is only relevant for authorities in the current session.
                let our_index = self.get_key_index_and_update_metrics(&session_info)?;
                update_gossip_topology(
                    sender,
                    our_index,
                    session_info.discovery_keys.clone(),
                    relay_parent,
                    session_index,
                )
                .await?;
            }

            // authority_discovery is just a cache so let's try every time we try to re-connect
            // if new authorities are present.
            self.update_authority_ids(sender, session_info.discovery_keys).await;
        }
    }
    Ok(())
}
/// Build the gossip topology for the session of the last finalized block if we haven't built
/// one.
///
/// This is needed to ensure that if finality is lagging across a session boundary and a
/// restart happens after the new session started, we build a topology for the session whose
/// blocks we haven't finalized yet.
/// Once finalized blocks start to be from a session we've built a topology for, we can stop.
///
/// Returns `Ok(())` without doing anything further when the finalized block number, its hash
/// or the session info cannot be obtained from the respective APIs.
async fn build_topology_for_last_finalized_if_needed(
&mut self,
sender: &mut impl overseer::GossipSupportSenderTrait,
current_session_index: u32,
) -> Result<(), util::Error> {
// Track the oldest session seen so far via the "current session" path.
self.min_known_session = self.min_known_session.min(current_session_index);
// Only query the chain while the finalized session is unknown or still older than the
// oldest session known so far.
if self
.finalized_needed_session
.map(|oldest_needed_session| oldest_needed_session < self.min_known_session)
.unwrap_or(true)
{
let (tx, rx) = oneshot::channel();
sender.send_message(ChainApiMessage::FinalizedBlockNumber(tx)).await;
let finalized_block_number = match rx.await? {
Ok(block_number) => block_number,
_ => return Ok(()),
};
let (tx, rx) = oneshot::channel();
sender
.send_message(ChainApiMessage::FinalizedBlockHash(finalized_block_number, tx))
.await;
let finalized_block_hash = match rx.await? {
Ok(Some(block_hash)) => block_hash,
_ => return Ok(()),
};
let finalized_session_index =
util::request_session_index_for_child(finalized_block_hash, sender)
.await
.await??;
// Build a topology only for a session older than every known one, and only once per
// such session.
if finalized_session_index < self.min_known_session &&
Some(finalized_session_index) != self.finalized_needed_session
{
gum::debug!(
target: LOG_TARGET,
?finalized_block_hash,
?finalized_block_number,
?finalized_session_index,
"Building topology for finalized block session",
);
let finalized_session_info = match util::request_session_info(
finalized_block_hash,
finalized_session_index,
sender,
)
.await
.await??
{
Some(session_info) => session_info,
_ => return Ok(()),
};
let our_index = self.get_key_index_and_update_metrics(&finalized_session_info)?;
update_gossip_topology(
sender,
our_index,
finalized_session_info.discovery_keys.clone(),
finalized_block_hash,
finalized_session_index,
)
.await?;
}
// Remember the finalized session so the same topology is not rebuilt.
self.finalized_needed_session = Some(finalized_session_index);
}
Ok(())
}
/// Checks if the node is an authority and also updates `pezkuwi_node_is_authority` and
/// `pezkuwi_node_is_teyrchain_validator` metrics accordingly.
///
/// On success, returns the index of our keys in `session_info.discovery_keys`.
/// Otherwise the error from `ensure_i_am_an_authority` is returned unchanged.
fn get_key_index_and_update_metrics(
&mut self,
session_info: &SessionInfo,
) -> Result<usize, util::Error> {
let authority_check_result =
ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys);
// Inspect by reference so the result itself can still be returned below.
match authority_check_result.as_ref() {
Ok(index) => {
gum::trace!(target: LOG_TARGET, "We are now an authority",);
self.metrics.on_is_authority();
// The subset of authorities participating in teyrchain consensus.
let teyrchain_validators_this_session = session_info.validators.len();
// First `maxValidators` entries are the teyrchain validators. We'll check
// if our index is in this set to avoid searching for the keys.
// https://github.com/paritytech/polkadot/blob/a52dca2be7840b23c19c153cf7e110b1e3e475f8/runtime/parachains/src/configuration.rs#L148
if *index < teyrchain_validators_this_session {
gum::trace!(target: LOG_TARGET, "We are now a teyrchain validator",);
self.metrics.on_is_teyrchain_validator();
} else {
gum::trace!(target: LOG_TARGET, "We are no longer a teyrchain validator",);
self.metrics.on_is_not_teyrchain_validator();
}
},
Err(util::Error::NotAValidator) => {
gum::trace!(target: LOG_TARGET, "We are no longer an authority",);
self.metrics.on_is_not_authority();
self.metrics.on_is_not_teyrchain_validator();
},
// Don't update on runtime errors.
Err(_) => {},
};
authority_check_result
}
/// Look up the known addresses of every authority in `authorities`.
///
/// Returns, in this order:
/// * the address sets of all authorities that could be resolved,
/// * the same address sets keyed by authority id,
/// * the number of authorities for which no addresses were found.
async fn resolve_authorities(
    &mut self,
    authorities: Vec<AuthorityDiscoveryId>,
) -> (Vec<HashSet<Multiaddr>>, HashMap<AuthorityDiscoveryId, HashSet<Multiaddr>>, usize) {
    let requested = authorities.len();
    let mut validator_addrs = Vec::with_capacity(requested);
    let mut resolved = HashMap::with_capacity(requested);
    let mut failures = 0;

    for authority in authorities {
        let lookup =
            self.authority_discovery.get_addresses_by_authority_id(authority.clone()).await;
        match lookup {
            Some(addrs) => {
                validator_addrs.push(addrs.clone());
                resolved.insert(authority, addrs);
            },
            None => {
                failures += 1;
                gum::debug!(
                    target: LOG_TARGET,
                    "Couldn't resolve addresses of authority: {:?}",
                    authority
                );
            },
        }
    }

    (validator_addrs, resolved, failures)
}
async fn issue_connection_request_to_changed<Sender>(
&mut self,
sender: &mut Sender,
authorities: Vec<AuthorityDiscoveryId>,
) where
Sender: overseer::GossipSupportSenderTrait,
{
let (_, resolved, _) = self.resolve_authorities(authorities).await;
let mut changed = Vec::new();
for (authority, new_addresses) in &resolved {
let new_peer_ids = new_addresses
.iter()
.flat_map(|addr| parse_addr(addr.clone()).ok().map(|(p, _)| p))
.collect::<HashSet<_>>();
match self.resolved_authorities.get(authority) {
Some(old_addresses) => {
let old_peer_ids = old_addresses
.iter()
.flat_map(|addr| parse_addr(addr.clone()).ok().map(|(p, _)| p))
.collect::<HashSet<_>>();
if !old_peer_ids.is_superset(&new_peer_ids) {
changed.push(new_addresses.clone());
}
},
None => changed.push(new_addresses.clone()),
}
}
gum::debug!(
target: LOG_TARGET,
num_changed = ?changed.len(),
?changed,
"Issuing a connection request to changed validators"
);
if !changed.is_empty() {
self.resolved_authorities = resolved;
sender
.send_message(NetworkBridgeTxMessage::AddToResolvedValidators {
validator_addrs: changed,
peer_set: PeerSet::Validation,
})
.await;
}
}
/// Resolve `authorities` and send a `ConnectToResolvedValidators` request for the
/// `Validation` peer set with every address set that could be resolved.
///
/// Also maintains `last_failure` / `failure_start`: both are set while at least a third of
/// the lookups fail and cleared otherwise.
async fn issue_connection_request<Sender>(
&mut self,
sender: &mut Sender,
authorities: Vec<AuthorityDiscoveryId>,
) where
Sender: overseer::GossipSupportSenderTrait,
{
let num = authorities.len();
let (validator_addrs, resolved, failures) = self.resolve_authorities(authorities).await;
// Unlike `issue_connection_request_to_changed`, the stored resolution is always replaced.
self.resolved_authorities = resolved;
gum::debug!(target: LOG_TARGET, %num, "Issuing a connection request");
sender
.send_message(NetworkBridgeTxMessage::ConnectToResolvedValidators {
validator_addrs,
peer_set: PeerSet::Validation,
})
.await;
// issue another request for the same session
// if at least a third of the authorities were not resolved.
if num != 0 && 3 * failures >= num {
let timestamp = Instant::now();
match self.failure_start {
// First failure of a potential sequence: remember when it started.
None => self.failure_start = Some(timestamp),
// Failing for at least `LOW_CONNECTIVITY_WARN_DELAY`: escalate to a warning.
Some(first) if first.elapsed() >= LOW_CONNECTIVITY_WARN_DELAY => {
gum::warn!(
target: LOG_TARGET,
connected = ?(num - failures),
target = ?num,
"Low connectivity - authority lookup failed for too many validators."
);
},
Some(_) => {
gum::debug!(
target: LOG_TARGET,
connected = ?(num - failures),
target = ?num,
"Low connectivity (due to authority lookup failures) - expected on startup."
);
},
}
self.last_failure = Some(timestamp);
} else {
self.last_failure = None;
self.failure_start = None;
};
}
/// Re-derive the peer id -> authority ids mapping for `authorities` from the authority
/// discovery service and reconcile it with the currently connected peers.
///
/// An `UpdatedAuthorityIds` message is sent for every connected peer whose set of authority
/// ids changed, and `connected_authorities` / `connected_peers` are updated to match.
async fn update_authority_ids<Sender>(
&mut self,
sender: &mut Sender,
authorities: Vec<AuthorityDiscoveryId>,
) where
Sender: overseer::GossipSupportSenderTrait,
{
// Freshly resolved view: which authority ids does each peer id stand for?
let mut authority_ids: HashMap<PeerId, HashSet<AuthorityDiscoveryId>> = HashMap::new();
for authority in authorities {
// Unresolvable authorities and unparsable addresses simply contribute no peer ids.
let peer_ids = self
.authority_discovery
.get_addresses_by_authority_id(authority.clone())
.await
.into_iter()
.flat_map(|list| list.into_iter())
.flat_map(|addr| parse_addr(addr).ok().map(|(p, _)| p))
.collect::<HashSet<_>>();
gum::trace!(
target: LOG_TARGET,
?peer_ids,
?authority,
"Resolved to peer ids"
);
for p in peer_ids {
authority_ids.entry(p).or_default().insert(authority.clone());
}
}
// peer was authority and now isn't
for (peer_id, current) in self.connected_peers.iter_mut() {
// empty -> nonempty is handled in the next loop
if !current.is_empty() && !authority_ids.contains_key(peer_id) {
sender
.send_message(NetworkBridgeRxMessage::UpdatedAuthorityIds {
peer_id: *peer_id,
authority_ids: HashSet::new(),
})
.await;
// The peer stays connected, but with an empty authority set.
for a in current.drain() {
self.connected_authorities.remove(&a);
}
}
}
// peer has new authority set.
for (peer_id, new) in authority_ids {
// If the peer is connected _and_ the authority IDs have changed.
if let Some(prev) = self.connected_peers.get(&peer_id).filter(|x| x != &&new) {
sender
.send_message(NetworkBridgeRxMessage::UpdatedAuthorityIds {
peer_id,
authority_ids: new.clone(),
})
.await;
// Drop the old authority -> peer entries before inserting the new ones.
prev.iter().for_each(|a| {
self.connected_authorities.remove(a);
});
new.iter().for_each(|a| {
self.connected_authorities.insert(a.clone(), peer_id);
});
self.connected_peers.insert(peer_id, new);
}
}
}
fn handle_connect_disconnect(&mut self, ev: NetworkBridgeEvent<GossipSupportNetworkMessage>) {
match ev {
NetworkBridgeEvent::PeerConnected(peer_id, _, _, o_authority) => {
if let Some(authority_ids) = o_authority {
authority_ids.iter().for_each(|a| {
self.connected_authorities.insert(a.clone(), peer_id);
});
self.connected_peers.insert(peer_id, authority_ids);
} else {
self.connected_peers.insert(peer_id, HashSet::new());
}
},
NetworkBridgeEvent::PeerDisconnected(peer_id) => {
if let Some(authority_ids) = self.connected_peers.remove(&peer_id) {
authority_ids.into_iter().for_each(|a| {
self.connected_authorities.remove(&a);
});
}
},
NetworkBridgeEvent::UpdatedAuthorityIds(_, _) => {
// The `gossip-support` subsystem itself issues these messages.
},
NetworkBridgeEvent::OurViewChange(_) => {},
NetworkBridgeEvent::PeerViewChange(_, _) => {},
NetworkBridgeEvent::NewGossipTopology { .. } => {},
NetworkBridgeEvent::PeerMessage(_, message) => {
// match void -> LLVM unreachable
match message {
ValidationProtocols::V3(m) => match m {},
}
},
}
}
/// Check connectivity and report on it in logs.
///
/// The ratio is the number of connected authorities relative to the number of resolved ones,
/// in percent; with nothing resolved it counts as 100%. An error is logged only when we are
/// an authority ourselves and the ratio is at or below `LOW_CONNECTIVITY_WARN_THRESHOLD`.
/// A debug report listing the unconnected authorities is always emitted.
fn check_connectivity(&mut self) {
    let absolute_connected = self.connected_authorities.len();
    let absolute_resolved = self.resolved_authorities.len();
    let connected_ratio = match absolute_resolved {
        0 => 100,
        resolved => (100 * absolute_connected) / resolved,
    };

    if self.is_authority_now && connected_ratio <= LOW_CONNECTIVITY_WARN_THRESHOLD {
        gum::error!(
            target: LOG_TARGET,
            session_index = self.last_session_index.unwrap_or_default(),
            "Connectivity seems low, we are only connected to {connected_ratio}% of available validators (see debug logs for details), if this persists more than a session action needs to be taken"
        );
    }

    let unconnected_authorities = self
        .resolved_authorities
        .iter()
        .filter(|(a, _)| !self.connected_authorities.contains_key(a));
    let pretty = PrettyAuthorities(unconnected_authorities);
    gum::debug!(
        target: LOG_TARGET,
        ?connected_ratio,
        ?absolute_connected,
        ?absolute_resolved,
        unconnected_authorities = %pretty,
        "Connectivity Report"
    );
}
}
/// Get the authorities of the past, present, and future.
///
/// Requests the authority set at `relay_parent` via `util::request_authorities`, logs how
/// many were returned and hands them back unchanged. Any request error is propagated.
async fn authorities_past_present_future(
sender: &mut impl overseer::GossipSupportSenderTrait,
relay_parent: Hash,
) -> Result<Vec<AuthorityDiscoveryId>, util::Error> {
let authorities = util::request_authorities(relay_parent, sender).await.await??;
gum::debug!(
target: LOG_TARGET,
authority_count = ?authorities.len(),
"Determined past/present/future authorities",
);
Ok(authorities)
}
/// Return an error if we're not a validator in the given set (do not have keys).
/// Otherwise, returns the index of our keys in `authorities`.
fn ensure_i_am_an_authority(
keystore: &KeystorePtr,
authorities: &[AuthorityDiscoveryId],
) -> Result<usize, util::Error> {
for (i, v) in authorities.iter().enumerate() {
if Keystore::has_keys(&**keystore, &[(v.to_raw_vec(), AuthorityDiscoveryId::ID)]) {
return Ok(i);
}
}
Err(util::Error::NotAValidator)
}
/// Filter out all controlled keys in the given set. Returns the number of keys removed.
///
/// A key counts as controlled when `keystore` holds it. The relative order of the remaining
/// authorities is preserved.
fn remove_all_controlled(
    keystore: &KeystorePtr,
    authorities: &mut Vec<AuthorityDiscoveryId>,
) -> usize {
    let before = authorities.len();
    // A single in-place pass; removing index by index would shift the tail on every removal.
    authorities.retain(|authority| {
        !Keystore::has_keys(&**keystore, &[(authority.to_raw_vec(), AuthorityDiscoveryId::ID)])
    });
    before - authorities.len()
}
/// We partition the list of all sorted `authorities` into `sqrt(len)` groups of `sqrt(len)` size
/// and form a matrix where each validator is connected to all validators in its row and column.
/// This is similar to `[web3]` research proposed topology, except for the groups are not teyrchain
/// groups (because not all validators are teyrchain validators and the group size is small),
/// but formed randomly via BABE randomness from two epochs ago.
/// This limits the amount of gossip peers to 2 * `sqrt(len)` and ensures the diameter of 2.
///
/// This function itself only derives the shuffle seed, shuffles `authorities` and sends the
/// result as a `NewGossipTopology` message together with `our_index` and `session_index`.
/// Errors from the `CurrentBabeEpoch` runtime request are propagated.
///
/// [web3]: https://research.web3.foundation/en/latest/polkadot/networking/3-avail-valid.html#topology
async fn update_gossip_topology(
sender: &mut impl overseer::GossipSupportSenderTrait,
our_index: usize,
authorities: Vec<AuthorityDiscoveryId>,
relay_parent: Hash,
session_index: SessionIndex,
) -> Result<(), util::Error> {
// retrieve BABE randomness
let random_seed = {
let (tx, rx) = oneshot::channel();
// TODO https://github.com/paritytech/polkadot/issues/5316:
// get the random seed from the `SessionInfo` instead.
sender
.send_message(RuntimeApiMessage::Request(
relay_parent,
RuntimeApiRequest::CurrentBabeEpoch(tx),
))
.await;
let randomness = rx.await??.randomness;
// Seed = blake2_256(b"gossipsu" ++ randomness). `copy_from_slice` requires the
// randomness to be exactly 32 bytes long.
let mut subject = [0u8; 40];
subject[..8].copy_from_slice(b"gossipsu");
subject[8..].copy_from_slice(&randomness);
sp_crypto_hashing::blake2_256(&subject)
};
// shuffle the validators and create the index mapping
let (shuffled_indices, canonical_shuffling) = {
let mut rng: ChaCha20Rng = SeedableRng::from_seed(random_seed);
let len = authorities.len();
let mut shuffled_indices = vec![0; len];
// Pair every authority with its original index before shuffling.
let mut canonical_shuffling: Vec<_> = authorities
.iter()
.enumerate()
.map(|(i, a)| (a.clone(), ValidatorIndex(i as _)))
.collect();
fisher_yates_shuffle(&mut rng, &mut canonical_shuffling[..]);
// Inverse mapping: original validator index -> position in the shuffled list.
for (i, (_, validator_index)) in canonical_shuffling.iter().enumerate() {
shuffled_indices[validator_index.0 as usize] = i;
}
(shuffled_indices, canonical_shuffling)
};
sender
.send_message(NetworkBridgeRxMessage::NewGossipTopology {
session: session_index,
local_index: Some(ValidatorIndex(our_index as _)),
canonical_shuffling,
shuffled_indices,
})
.await;
Ok(())
}
/// Shuffle `items` in place using the Durstenfeld variant of the Fisher-Yates shuffle.
///
/// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm
///
/// NOTE(review): the sampled range is deliberately kept as a `u32` half-open range. The
/// result presumably has to be reproducible from the seed alone, so the sampling type
/// and range form should not be changed casually - confirm before touching.
fn fisher_yates_shuffle<T, R: Rng + ?Sized>(rng: &mut R, items: &mut [T]) {
    let mut unlocked = items.len();
    while unlocked > 1 {
        // invariant: elements with index > `last` have been locked in place.
        let last = unlocked - 1;
        let pick = rng.gen_range(0u32..(last as u32 + 1));
        items.swap(last, pick as usize);
        unlocked = last;
    }
}
#[overseer::subsystem(GossipSupport, error = SubsystemError, prefix = self::overseer)]
impl<Context, AD> GossipSupport<AD>
where
    AD: AuthorityDiscovery + Clone,
{
    /// Wrap the subsystem's `run` loop into a `SpawnedSubsystem` named
    /// `gossip-support-subsystem`; the value returned by `run` is discarded.
    fn start(self, ctx: Context) -> SpawnedSubsystem {
        SpawnedSubsystem {
            name: "gossip-support-subsystem",
            future: self.run(ctx).map(|_| Ok(())).boxed(),
        }
    }
}
/// Helper struct to get a nice rendering of unreachable authorities.
///
/// Wraps a cloneable iterator over `(authority, addresses)` pairs; the iterator is cloned on
/// every formatting call so the value can be displayed more than once.
struct PrettyAuthorities<I>(I);

impl<'a, I> fmt::Display for PrettyAuthorities<I>
where
    I: Iterator<Item = (&'a AuthorityDiscoveryId, &'a HashSet<Multiaddr>)> + Clone,
{
    /// Writes `None` for an empty iterator. Otherwise writes a leading newline followed by
    /// one paragraph per authority: its id, its addresses (one per line) and a blank line.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut entries = self.0.clone().peekable();
        let header = if entries.peek().is_some() { "\n" } else { "None" };
        f.write_str(header)?;

        for (authority, addrs) in entries {
            writeln!(f, "{}:", authority)?;
            for addr in addrs {
                writeln!(f, " {}", addr)?;
            }
            f.write_str("\n")?;
        }
        Ok(())
    }
}
@@ -0,0 +1,91 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use pezkuwi_node_subsystem_util::{
metrics,
metrics::{
prometheus,
prometheus::{Gauge, PrometheusError, Registry, U64},
},
};
/// Gossip support subsystem metrics.
///
/// Wraps `None` when no metrics were registered, in which case every update is a no-op.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
/// The registered gauges behind [`Metrics`]. Both hold either `0` or `1`.
#[derive(Clone)]
struct MetricsInner {
/// Tracks authority status for producing relay chain blocks.
is_authority: Gauge<U64>,
/// Tracks authority status for teyrchain approval checking.
is_teyrchain_validator: Gauge<U64>,
}
impl Metrics {
/// Dummy constructor for testing.
#[cfg(test)]
pub fn new_dummy() -> Self {
Self(None)
}
/// Set the `relaychain validator` metric.
pub fn on_is_authority(&self) {
if let Some(metrics) = &self.0 {
metrics.is_authority.set(1);
}
}
/// Unset the `relaychain validator` metric.
pub fn on_is_not_authority(&self) {
if let Some(metrics) = &self.0 {
metrics.is_authority.set(0);
}
}
/// Set the `teyrchain validator` metric.
pub fn on_is_teyrchain_validator(&self) {
if let Some(metrics) = &self.0 {
metrics.is_teyrchain_validator.set(1);
}
}
/// Unset the `teyrchain validator` metric.
pub fn on_is_not_teyrchain_validator(&self) {
if let Some(metrics) = &self.0 {
metrics.is_teyrchain_validator.set(0);
}
}
}
impl metrics::Metrics for Metrics {
    /// Create both gauges and register them with `registry`.
    ///
    /// # Errors
    ///
    /// Returns the `PrometheusError` from creating or registering either gauge.
    fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
        let metrics = MetricsInner {
            is_authority: prometheus::register(
                Gauge::new(
                    "pezkuwi_node_is_active_validator",
                    "Tracks if the validator is in the active set. \
                     Updates at session boundary.",
                )?,
                registry,
            )?,
            is_teyrchain_validator: prometheus::register(
                // A `\` line continuation drops the newline and the next line's leading
                // whitespace, so the separating space must come before the backslash.
                Gauge::new(
                    "pezkuwi_node_is_teyrchain_validator",
                    "Tracks if the validator participates in teyrchain consensus. \
                     Teyrchain validators are a subset of the active set validators that perform \
                     approval checking of all teyrchain candidates in a session. \
                     Updates at session boundary.",
                )?,
                registry,
            )?,
        };
        Ok(Metrics(Some(metrics)))
    }
}
File diff suppressed because it is too large Load Diff
+45
View File
@@ -0,0 +1,45 @@
[package]
name = "pezkuwi-node-network-protocol"
version = "7.0.0"
authors.workspace = true
edition.workspace = true
license.workspace = true
description = "Primitives types for the Node-side"
homepage.workspace = true
repository.workspace = true
[lints]
workspace = true
[dependencies]
async-channel = { workspace = true }
async-trait = { workspace = true }
bitvec = { workspace = true, default-features = true }
codec = { features = ["derive"], workspace = true }
derive_more = { workspace = true, default-features = true }
fatality = { workspace = true }
futures = { workspace = true }
gum = { workspace = true, default-features = true }
hex = { workspace = true, default-features = true }
pezkuwi-node-primitives = { workspace = true, default-features = true }
pezkuwi-primitives = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
sc-authority-discovery = { workspace = true, default-features = true }
sc-network = { workspace = true, default-features = true }
sc-network-types = { workspace = true, default-features = true }
sp-runtime = { workspace = true, default-features = true }
strum = { features = ["derive"], workspace = true, default-features = true }
thiserror = { workspace = true }
[dev-dependencies]
rand_chacha = { workspace = true, default-features = true }
[features]
runtime-benchmarks = [
"gum/runtime-benchmarks",
"pezkuwi-node-primitives/runtime-benchmarks",
"pezkuwi-primitives/runtime-benchmarks",
"sc-authority-discovery/runtime-benchmarks",
"sc-network/runtime-benchmarks",
"sp-runtime/runtime-benchmarks",
]
@@ -0,0 +1,61 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Authority discovery service interfacing.
use std::{collections::HashSet, fmt::Debug};
use async_trait::async_trait;
use sc_authority_discovery::Service as AuthorityDiscoveryService;
use pezkuwi_primitives::AuthorityDiscoveryId;
use sc_network::Multiaddr;
use sc_network_types::PeerId;
/// An abstraction over the authority discovery service.
///
/// Needed for mocking in tests mostly.
///
/// Both lookups return `None` when the cache has no entry for the queried key.
#[async_trait]
pub trait AuthorityDiscovery: Send + Debug + 'static {
/// Get the addresses for the given [`AuthorityDiscoveryId`] from the local address cache.
async fn get_addresses_by_authority_id(
&mut self,
authority: AuthorityDiscoveryId,
) -> Option<HashSet<Multiaddr>>;
/// Get the [`AuthorityDiscoveryId`] for the given [`PeerId`] from the local address cache.
async fn get_authority_ids_by_peer_id(
&mut self,
peer_id: PeerId,
) -> Option<HashSet<AuthorityDiscoveryId>>;
}
/// Forwards both lookups to the real authority discovery service.
///
/// NOTE(review): the calls are written with the explicit `AuthorityDiscoveryService::` path,
/// presumably so that the service's own methods of the same name are selected rather than
/// these trait methods (which would recurse) - keep the path-qualified form.
#[async_trait]
impl AuthorityDiscovery for AuthorityDiscoveryService {
async fn get_addresses_by_authority_id(
&mut self,
authority: AuthorityDiscoveryId,
) -> Option<HashSet<Multiaddr>> {
AuthorityDiscoveryService::get_addresses_by_authority_id(self, authority).await
}
async fn get_authority_ids_by_peer_id(
&mut self,
peer_id: PeerId,
) -> Option<HashSet<AuthorityDiscoveryId>> {
AuthorityDiscoveryService::get_authority_ids_by_peer_id(self, peer_id).await
}
}
@@ -0,0 +1,738 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Grid topology support implementation
//! The basic operation of the 2D grid topology is that:
//! * A validator producing a message sends it to its row-neighbors and its column-neighbors
//! * A validator receiving a message originating from one of its row-neighbors sends it to its
//! column-neighbors
//! * A validator receiving a message originating from one of its column-neighbors sends it to its
//! row-neighbors
//!
//! This grid approach defines 2 unique paths for every validator to reach every other validator in
//! at most 2 hops.
//!
//! However, we also supplement this with some degree of random propagation:
//! every validator, upon seeing a message for the first time, propagates it to a small number
//! of random peers (see [`DEFAULT_RANDOM_CIRCULATION`]).
//! This inserts some redundancy in case the grid topology isn't working or is being attacked -
//! an adversary doesn't know which peers a validator will send to.
//! This is combined with the property that the adversary doesn't know which validators will elect
//! to check a block.
use crate::PeerId;
use pezkuwi_primitives::{AuthorityDiscoveryId, SessionIndex, ValidatorIndex};
use rand::{CryptoRng, Rng};
use std::{
collections::{hash_map, HashMap, HashSet},
fmt::Debug,
};
const LOG_TARGET: &str = "teyrchain::grid-topology";
/// The sample rate for randomly propagating messages. This
/// reduces the left tail of the binomial distribution but also
/// introduces a bias towards peers who we sample before others
/// (i.e. those who get a block before others).
pub const DEFAULT_RANDOM_SAMPLE_RATE: usize = crate::MIN_GOSSIP_PEERS;
/// The number of peers to randomly propagate messages to.
pub const DEFAULT_RANDOM_CIRCULATION: usize = 4;
/// Information about a peer in the gossip topology for a session.
#[derive(Debug, Clone, PartialEq)]
pub struct TopologyPeerInfo {
/// The validator's known peer IDs.
///
/// `SessionGridTopology::update_authority_ids` appends to this list when a new peer id is
/// learned for the validator.
pub peer_ids: Vec<PeerId>,
/// The index of the validator in the discovery keys of the corresponding
/// `SessionInfo`. This can extend _beyond_ the set of active teyrchain validators.
pub validator_index: ValidatorIndex,
/// The authority discovery public key of the validator in the corresponding
/// `SessionInfo`.
pub discovery_id: AuthorityDiscoveryId,
}
/// Topology representation for a session.
///
/// `shuffled_indices` and `canonical_shuffling` are expected to have the same length;
/// `compute_grid_neighbors_for` returns `None` when they do not.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct SessionGridTopology {
/// An array mapping validator indices to their indices in the
/// shuffling itself. This has the same size as the number of validators
/// in the session.
shuffled_indices: Vec<usize>,
/// The canonical shuffling of validators for the session.
canonical_shuffling: Vec<TopologyPeerInfo>,
/// The list of peer-ids in an efficient way to search.
peer_ids: HashSet<PeerId>,
}
impl SessionGridTopology {
/// Create a new session grid topology.
pub fn new(shuffled_indices: Vec<usize>, canonical_shuffling: Vec<TopologyPeerInfo>) -> Self {
let mut peer_ids = HashSet::new();
for peer_info in canonical_shuffling.iter() {
for peer_id in peer_info.peer_ids.iter() {
peer_ids.insert(*peer_id);
}
}
SessionGridTopology { shuffled_indices, canonical_shuffling, peer_ids }
}
/// Updates the known peer ids for the passed authorities ids.
pub fn update_authority_ids(
&mut self,
peer_id: PeerId,
ids: &HashSet<AuthorityDiscoveryId>,
) -> bool {
let mut updated = false;
if !self.peer_ids.contains(&peer_id) {
for peer in self
.canonical_shuffling
.iter_mut()
.filter(|peer| ids.contains(&peer.discovery_id))
{
peer.peer_ids.push(peer_id);
self.peer_ids.insert(peer_id);
updated = true;
}
}
updated
}
/// Produces the outgoing routing logic for a particular peer.
///
/// Returns `None` if the validator index is out of bounds.
pub fn compute_grid_neighbors_for(&self, v: ValidatorIndex) -> Option<GridNeighbors> {
if self.shuffled_indices.len() != self.canonical_shuffling.len() {
return None;
}
let shuffled_val_index = *self.shuffled_indices.get(v.0 as usize)?;
let neighbors = matrix_neighbors(shuffled_val_index, self.shuffled_indices.len())?;
let mut grid_subset = GridNeighbors::empty();
for r_n in neighbors.row_neighbors {
let n = &self.canonical_shuffling[r_n];
grid_subset.validator_indices_x.insert(n.validator_index);
for p in &n.peer_ids {
grid_subset.peers_x.insert(*p);
}
}
for c_n in neighbors.column_neighbors {
let n = &self.canonical_shuffling[c_n];
grid_subset.validator_indices_y.insert(n.validator_index);
for p in &n.peer_ids {
grid_subset.peers_y.insert(*p);
}
}
Some(grid_subset)
}
/// Tells if a given peer id is validator in a session
pub fn is_validator(&self, peer: &PeerId) -> bool {
self.peer_ids.contains(peer)
}
}
/// Row and column neighbors of a position in the validator matrix.
struct MatrixNeighbors<R, C> {
    row_neighbors: R,
    column_neighbors: C,
}

/// Compute the row and column neighbors of `val_index` in a matrix holding `len` entries.
///
/// The entries are laid out row by row in a matrix whose width is the integer part of
/// `sqrt(len)`; the last row may be incomplete. For `len == 11`:
///
/// ```text
/// 0  1  2
/// 3  4  5
/// 6  7  8
/// 9 10
/// ```
///
/// so the neighbors of index 10 are 9 (row) and 1, 4, 7 (column).
///
/// Returns `None` when `val_index` is not smaller than `len`. `val_index` itself is never
/// part of either iterator.
fn matrix_neighbors(
    val_index: usize,
    len: usize,
) -> Option<MatrixNeighbors<impl Iterator<Item = usize>, impl Iterator<Item = usize>>> {
    if val_index >= len {
        return None;
    }

    // `len >= 1` here, so the width is at least 1 and the divisions below are safe.
    let width = (len as f64).sqrt() as usize;
    let (row, column) = (val_index / width, val_index % width);

    let row_start = row * width;
    let row_end = (row_start + width).min(len);

    let not_self = move |candidate: &usize| *candidate != val_index;

    Some(MatrixNeighbors {
        row_neighbors: (row_start..row_end).filter(not_self),
        column_neighbors: (column..len).step_by(width).filter(not_self),
    })
}
/// Information about the grid neighbors for a particular node in the topology.
///
/// `SessionGridTopology::compute_grid_neighbors_for` fills the X sets from the node's row
/// neighbors and the Y sets from its column neighbors.
#[derive(Debug, Clone, PartialEq)]
pub struct GridNeighbors {
/// Represent peers in the X axis
pub peers_x: HashSet<PeerId>,
/// Represent validators in the X axis
pub validator_indices_x: HashSet<ValidatorIndex>,
/// Represent peers in the Y axis
pub peers_y: HashSet<PeerId>,
/// Represent validators in the Y axis
pub validator_indices_y: HashSet<ValidatorIndex>,
}
impl GridNeighbors {
    /// Create a set of grid neighbors with no peers and no validators on either axis.
    ///
    /// Useful for testing.
    pub fn empty() -> Self {
        Self {
            peers_x: HashSet::new(),
            validator_indices_x: HashSet::new(),
            peers_y: HashSet::new(),
            validator_indices_y: HashSet::new(),
        }
    }

    /// Decide which part of the grid a message must be forwarded to, given the validator
    /// index of its originator.
    ///
    /// Locally originated messages (`local == true`) always go to both axes. Otherwise a
    /// message received from the X axis is forwarded along Y and vice versa; an originator
    /// that is on neither axis requires no routing.
    pub fn required_routing_by_index(
        &self,
        originator: ValidatorIndex,
        local: bool,
    ) -> RequiredRouting {
        if local {
            return RequiredRouting::GridXY;
        }

        let on_x = self.validator_indices_x.contains(&originator);
        let on_y = self.validator_indices_y.contains(&originator);

        if on_x && on_y {
            // If the grid works as expected, this shouldn't happen.
            RequiredRouting::GridXY
        } else if on_x {
            // Messages from X go to Y.
            RequiredRouting::GridY
        } else if on_y {
            // Messages from Y go to X.
            RequiredRouting::GridX
        } else {
            RequiredRouting::None
        }
    }

    /// Decide which part of the grid a message must be forwarded to, given the peer ID of
    /// its originator.
    ///
    /// Follows the same rules as `required_routing_by_index`, and additionally logs at debug
    /// level when the originator unexpectedly appears on both axes.
    pub fn required_routing_by_peer_id(&self, originator: PeerId, local: bool) -> RequiredRouting {
        if local {
            return RequiredRouting::GridXY;
        }

        let on_x = self.peers_x.contains(&originator);
        let on_y = self.peers_y.contains(&originator);

        if on_x && on_y {
            // If the grid works as expected, this shouldn't happen.
            gum::debug!(
                target: LOG_TARGET,
                ?originator,
                "Grid topology is unexpected, play it safe and send to X AND Y"
            );
            RequiredRouting::GridXY
        } else if on_x {
            // Messages from X go to Y.
            RequiredRouting::GridY
        } else if on_y {
            // Messages from Y go to X.
            RequiredRouting::GridX
        } else {
            RequiredRouting::None
        }
    }

    /// Whether `peer` belongs to the set of peers selected by `required_routing`.
    ///
    /// `All` accepts every peer; `None` and `PendingTopology` accept no peer.
    pub fn route_to_peer(&self, required_routing: RequiredRouting, peer: &PeerId) -> bool {
        let on_x = || self.peers_x.contains(peer);
        let on_y = || self.peers_y.contains(peer);

        match required_routing {
            RequiredRouting::None | RequiredRouting::PendingTopology => false,
            RequiredRouting::All => true,
            RequiredRouting::GridX => on_x(),
            RequiredRouting::GridY => on_y(),
            RequiredRouting::GridXY => on_x() || on_y(),
        }
    }

    /// Collect the peers of this topology (X axis first, then Y axis) that appear on
    /// neither axis of `other`.
    pub fn peers_diff(&self, other: &Self) -> Vec<PeerId> {
        let mut missing = Vec::new();
        for peer_id in self.peers_x.iter().chain(self.peers_y.iter()) {
            if !other.peers_x.contains(peer_id) && !other.peers_y.contains(peer_id) {
                missing.push(*peer_id);
            }
        }
        missing
    }

    /// Sum of the sizes of the X and Y peer sets (saturating).
    pub fn len(&self) -> usize {
        let x = self.peers_x.len();
        let y = self.peers_y.len();
        x.saturating_add(y)
    }
}
/// An entry tracking a session grid topology and some cached local neighbors.
#[derive(Debug)]
pub struct SessionGridTopologyEntry {
// The full topology of the session.
topology: SessionGridTopology,
// Grid neighbors computed for `local_index`; empty when no neighbors could be computed.
local_neighbors: GridNeighbors,
// Validator index of the local node in this session, if there is one.
local_index: Option<ValidatorIndex>,
}
impl SessionGridTopologyEntry {
    /// Borrow the cached grid neighbors of the local node.
    pub fn local_grid_neighbors(&self) -> &GridNeighbors {
        &self.local_neighbors
    }

    /// Mutably borrow the cached grid neighbors of the local node.
    pub fn local_grid_neighbors_mut(&mut self) -> &mut GridNeighbors {
        &mut self.local_neighbors
    }

    /// Borrow the underlying session topology.
    pub fn get(&self) -> &SessionGridTopology {
        &self.topology
    }

    /// Whether `peer` is a known peer ID of a validator in this session.
    pub fn is_validator(&self, peer: &PeerId) -> bool {
        self.topology.is_validator(peer)
    }

    /// List the peers selected by `required_routing`.
    ///
    /// `All` yields every peer ID of the topology, the grid variants yield the local
    /// neighbors on the respective axis (X before Y for `GridXY`), and `None` /
    /// `PendingTopology` yield an empty list.
    pub fn peers_to_route(&self, required_routing: RequiredRouting) -> Vec<PeerId> {
        let neighbors = &self.local_neighbors;
        match required_routing {
            RequiredRouting::None | RequiredRouting::PendingTopology => Vec::new(),
            RequiredRouting::All => self.topology.peer_ids.iter().copied().collect(),
            RequiredRouting::GridX => neighbors.peers_x.iter().copied().collect(),
            RequiredRouting::GridY => neighbors.peers_y.iter().copied().collect(),
            RequiredRouting::GridXY => {
                let mut peers: Vec<PeerId> = neighbors.peers_x.iter().copied().collect();
                peers.extend(neighbors.peers_y.iter().copied());
                peers
            },
        }
    }

    /// Record `peer_id` for the validators identified by `ids` and refresh the cached
    /// local neighbors when something changed.
    ///
    /// Returns `true` if the underlying topology was updated.
    pub fn update_authority_ids(
        &mut self,
        peer_id: PeerId,
        ids: &HashSet<AuthorityDiscoveryId>,
    ) -> bool {
        let changed = self.topology.update_authority_ids(peer_id, ids);
        if !changed {
            return false;
        }

        // A new peer id was added, so the cached neighbors must be recomputed for their
        // peer sets to include it. The old cache is kept if there is no local index or the
        // neighbors cannot be computed.
        let recomputed = self
            .local_index
            .and_then(|index| self.topology.compute_grid_neighbors_for(index));
        if let Some(grid) = recomputed {
            self.local_neighbors = grid;
        }

        changed
    }
}
/// A set of topologies indexed by session
///
/// Each session maps to an optional topology entry (absent until `insert_topology` is
/// called) and a reference counter managed by `inc_session_refs` / `dec_session_refs`.
#[derive(Default)]
pub struct SessionGridTopologies {
inner: HashMap<SessionIndex, (Option<SessionGridTopologyEntry>, usize)>,
}
impl SessionGridTopologies {
    /// Returns a topology for the specific session index
    ///
    /// `None` is returned both when the session is unknown and when it is only
    /// reference-counted without a topology having been inserted yet.
    pub fn get_topology(&self, session: SessionIndex) -> Option<&SessionGridTopologyEntry> {
        self.inner.get(&session).and_then(|val| val.0.as_ref())
    }

    /// Updates the known peer ids for the passed authorities ids in every stored topology.
    ///
    /// Returns `true` if at least one topology was updated.
    pub fn update_authority_ids(
        &mut self,
        peer_id: PeerId,
        ids: &HashSet<AuthorityDiscoveryId>,
    ) -> bool {
        // Deliberately not `Iterator::any`: it short-circuits on the first `true`, which
        // would leave the remaining sessions without the new peer id.
        let mut any_updated = false;
        for (entry, _refs) in self.inner.values_mut() {
            if let Some(entry) = entry.as_mut() {
                any_updated |= entry.update_authority_ids(peer_id, ids);
            }
        }
        any_updated
    }

    /// Increase references counter for a specific topology
    ///
    /// Creates the session slot (without a topology) if it does not exist yet.
    pub fn inc_session_refs(&mut self, session: SessionIndex) {
        self.inner.entry(session).or_insert((None, 0)).1 += 1;
    }

    /// Decrease references counter for a specific topology
    ///
    /// The session slot, including its topology, is dropped once the counter reaches zero.
    /// Unknown sessions are ignored.
    pub fn dec_session_refs(&mut self, session: SessionIndex) {
        if let hash_map::Entry::Occupied(mut occupied) = self.inner.entry(session) {
            occupied.get_mut().1 = occupied.get().1.saturating_sub(1);
            if occupied.get().1 == 0 {
                let _ = occupied.remove();
            }
        }
    }

    /// Insert a new topology, no-op if already present.
    ///
    /// The local grid neighbors are computed from `local_index`; they are empty when there
    /// is no local index or the neighbors cannot be computed. Inserting does not change the
    /// reference counter of the session.
    pub fn insert_topology(
        &mut self,
        session: SessionIndex,
        topology: SessionGridTopology,
        local_index: Option<ValidatorIndex>,
    ) {
        let entry = self.inner.entry(session).or_insert((None, 0));
        if entry.0.is_none() {
            let local_neighbors = local_index
                .and_then(|l| topology.compute_grid_neighbors_for(l))
                .unwrap_or_else(GridNeighbors::empty);
            entry.0 = Some(SessionGridTopologyEntry { topology, local_neighbors, local_index });
        }
    }
}
/// A simple storage for a topology and the corresponding session index
#[derive(Debug)]
struct GridTopologySessionBound {
// The topology entry (topology plus cached local neighbors).
entry: SessionGridTopologyEntry,
// The session `entry` belongs to.
session_index: SessionIndex,
}
/// A storage for the current and maybe previous topology
#[derive(Debug)]
pub struct SessionBoundGridTopologyStorage {
// The most recently set topology.
current_topology: GridTopologySessionBound,
// The topology that was current before the last `update_topology` call, if any.
prev_topology: Option<GridTopologySessionBound>,
}
impl Default for SessionBoundGridTopologyStorage {
fn default() -> Self {
// having this struct be `Default` is objectively stupid
// but used in a few places
SessionBoundGridTopologyStorage {
current_topology: GridTopologySessionBound {
// session 0 is valid so we should use the upper bound
// as the default instead of the lower bound.
session_index: SessionIndex::max_value(),
entry: SessionGridTopologyEntry {
topology: SessionGridTopology {
shuffled_indices: Vec::new(),
canonical_shuffling: Vec::new(),
peer_ids: Default::default(),
},
local_neighbors: GridNeighbors::empty(),
local_index: None,
},
},
prev_topology: None,
}
}
}
impl SessionBoundGridTopologyStorage {
    /// Look up the topology stored for session `idx`, falling back to the current topology
    /// when that session is not stored, so that some grid topology is always returned.
    pub fn get_topology_or_fallback(&self, idx: SessionIndex) -> &SessionGridTopologyEntry {
        match self.get_topology(idx) {
            Some(entry) => entry,
            None => &self.current_topology.entry,
        }
    }

    /// Look up the topology stored for session `idx`.
    ///
    /// The previous topology is checked before the current one. Returns `None` if neither
    /// is bound to `idx`.
    pub fn get_topology(&self, idx: SessionIndex) -> Option<&SessionGridTopologyEntry> {
        match &self.prev_topology {
            Some(prev) if prev.session_index == idx => Some(&prev.entry),
            _ if self.current_topology.session_index == idx => Some(&self.current_topology.entry),
            _ => None,
        }
    }

    /// Install a new current topology; the former current topology becomes the previous one
    /// and whatever was stored as previous is dropped.
    pub fn update_topology(
        &mut self,
        session_index: SessionIndex,
        topology: SessionGridTopology,
        local_index: Option<ValidatorIndex>,
    ) {
        let local_neighbors = match local_index {
            Some(index) =>
                topology.compute_grid_neighbors_for(index).unwrap_or_else(GridNeighbors::empty),
            None => GridNeighbors::empty(),
        };

        let incoming = GridTopologySessionBound {
            entry: SessionGridTopologyEntry { topology, local_neighbors, local_index },
            session_index,
        };
        let outgoing = std::mem::replace(&mut self.current_topology, incoming);
        self.prev_topology = Some(outgoing);
    }

    /// Borrow the current grid topology.
    pub fn get_current_topology(&self) -> &SessionGridTopologyEntry {
        &self.current_topology.entry
    }

    /// The session index the current topology is bound to.
    pub fn get_current_session_index(&self) -> SessionIndex {
        self.current_topology.session_index
    }

    /// Mutably borrow the current grid topology. Dangerous and intended
    /// to be used in tests.
    pub fn get_current_topology_mut(&mut self) -> &mut SessionGridTopologyEntry {
        &mut self.current_topology.entry
    }
}
/// A representation of routing based on sample
///
/// Sampling stops returning `true` once `sent` has reached `target`.
#[derive(Debug, Clone, Copy)]
pub struct RandomRouting {
/// The number of peers to target.
target: usize,
/// The number of peers this has been sent to.
sent: usize,
/// Sampling rate
///
/// `sample` selects a peer with probability `sample_rate / n_peers_total`, and always
/// selects it when `sample_rate` exceeds `n_peers_total`.
sample_rate: usize,
}
impl Default for RandomRouting {
    /// Start with nothing sent, using the default circulation target and sample rate.
    fn default() -> Self {
        Self {
            sent: 0,
            target: DEFAULT_RANDOM_CIRCULATION,
            sample_rate: DEFAULT_RANDOM_SAMPLE_RATE,
        }
    }
}
impl RandomRouting {
    /// Decide at random whether a specific peer should receive the message.
    ///
    /// Returns `false` without consulting `rng` when there are no peers or the target has
    /// already been reached, and `true` without consulting `rng` when the sample rate
    /// exceeds `n_peers_total`. Otherwise the peer is picked with probability
    /// `sample_rate / n_peers_total`.
    pub fn sample(&self, n_peers_total: usize, rng: &mut (impl CryptoRng + Rng)) -> bool {
        if n_peers_total == 0 || self.sent >= self.target {
            return false;
        }
        if self.sample_rate > n_peers_total {
            return true;
        }
        rng.gen_ratio(self.sample_rate as _, n_peers_total as _)
    }

    /// Record that the message has been sent to one more peer.
    pub fn inc_sent(&mut self) {
        self.sent += 1
    }

    /// Returns `true` once the number of sent messages has reached the target.
    pub fn is_complete(&self) -> bool {
        self.target <= self.sent
    }
}
/// Routing mode
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum RequiredRouting {
    /// We don't know yet, because we're waiting for topology info
    /// (race condition between learning about the first blocks in a new session
    /// and getting the topology for that session)
    PendingTopology,
    /// Propagate to all peers of any kind.
    All,
    /// Propagate to all peers sharing either the X or Y dimension of the grid.
    GridXY,
    /// Propagate to all peers sharing the X dimension of the grid.
    GridX,
    /// Propagate to all peers sharing the Y dimension of the grid.
    GridY,
    /// No required propagation.
    None,
}

impl RequiredRouting {
    /// Whether this routing mode definitely selects no peers.
    ///
    /// True for `PendingTopology` and `None` only.
    pub fn is_empty(self) -> bool {
        matches!(self, Self::PendingTopology | Self::None)
    }

    /// Merge two routing modes into the one covering both.
    ///
    /// `All` dominates everything, then `GridXY`; `GridX` and `GridY` together widen to
    /// `GridXY`. `None` and `PendingTopology` yield to any other mode, and combining the two
    /// of them gives `PendingTopology`.
    pub fn combine(self, other: Self) -> Self {
        match (self, other) {
            (Self::All, _) | (_, Self::All) => Self::All,
            (Self::GridXY, _) | (_, Self::GridXY) => Self::GridXY,
            (Self::GridX, Self::GridY) | (Self::GridY, Self::GridX) => Self::GridXY,
            // From here on neither side is `All`/`GridXY`, and the sides are not opposite axes.
            (Self::None, Self::PendingTopology) | (Self::PendingTopology, Self::None) =>
                Self::PendingTopology,
            (Self::None, rhs) | (Self::PendingTopology, rhs) => rhs,
            (lhs, Self::None) | (lhs, Self::PendingTopology) => lhs,
            (Self::GridX, Self::GridX) => Self::GridX,
            (Self::GridY, Self::GridY) => Self::GridY,
        }
    }
}
// Unit tests for routing-mode combination, random sampling and the matrix neighbor
// computation.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
use rand_chacha::ChaCha12Rng;
// Deterministic RNG with a fixed seed so that the sampling tests are reproducible.
fn dummy_rng() -> ChaCha12Rng {
rand_chacha::ChaCha12Rng::seed_from_u64(12345)
}
// Checks `RequiredRouting::combine` for representative pairs in both argument orders.
#[test]
fn test_required_routing_combine() {
assert_eq!(RequiredRouting::All.combine(RequiredRouting::None), RequiredRouting::All);
assert_eq!(RequiredRouting::All.combine(RequiredRouting::GridXY), RequiredRouting::All);
assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::All), RequiredRouting::All);
assert_eq!(RequiredRouting::None.combine(RequiredRouting::All), RequiredRouting::All);
assert_eq!(RequiredRouting::None.combine(RequiredRouting::None), RequiredRouting::None);
assert_eq!(
RequiredRouting::PendingTopology.combine(RequiredRouting::GridX),
RequiredRouting::GridX
);
assert_eq!(
RequiredRouting::GridX.combine(RequiredRouting::PendingTopology),
RequiredRouting::GridX
);
assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridY), RequiredRouting::GridXY);
assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridX), RequiredRouting::GridXY);
assert_eq!(
RequiredRouting::GridXY.combine(RequiredRouting::GridXY),
RequiredRouting::GridXY
);
assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridX), RequiredRouting::GridX);
assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridY), RequiredRouting::GridY);
assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridY), RequiredRouting::GridY);
assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridX), RequiredRouting::GridX);
assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridXY), RequiredRouting::GridXY);
assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::None), RequiredRouting::GridY);
assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::None), RequiredRouting::GridX);
assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::None), RequiredRouting::GridXY);
assert_eq!(
RequiredRouting::PendingTopology.combine(RequiredRouting::None),
RequiredRouting::PendingTopology
);
assert_eq!(
RequiredRouting::None.combine(RequiredRouting::PendingTopology),
RequiredRouting::PendingTopology
);
}
// Pins the exact sequence of sampling outcomes for the fixed seed. The order of the
// `sample` calls matters: each call that reaches the RNG consumes randomness.
#[test]
fn test_random_routing_sample() {
// This test is fragile as it relies on a specific ChaCha12Rng
// sequence that might be implementation defined even for a static seed
let mut rng = dummy_rng();
let mut random_routing = RandomRouting { target: 4, sent: 0, sample_rate: 8 };
assert_eq!(random_routing.sample(16, &mut rng), true);
random_routing.inc_sent();
assert_eq!(random_routing.sample(16, &mut rng), false);
assert_eq!(random_routing.sample(16, &mut rng), false);
assert_eq!(random_routing.sample(16, &mut rng), true);
random_routing.inc_sent();
assert_eq!(random_routing.sample(16, &mut rng), true);
random_routing.inc_sent();
assert_eq!(random_routing.sample(16, &mut rng), false);
assert_eq!(random_routing.sample(16, &mut rng), false);
assert_eq!(random_routing.sample(16, &mut rng), false);
assert_eq!(random_routing.sample(16, &mut rng), true);
random_routing.inc_sent();
// The target of 4 has been reached: every further sample must be rejected.
for _ in 0..16 {
assert_eq!(random_routing.sample(16, &mut rng), false);
}
}
// Samples `iters` times against `npeers` peers, counting a send for every hit, and
// returns the number of hits.
fn run_random_routing(
random_routing: &mut RandomRouting,
rng: &mut (impl CryptoRng + Rng),
npeers: usize,
iters: usize,
) -> usize {
let mut ret = 0_usize;
for _ in 0..iters {
if random_routing.sample(npeers, rng) {
random_routing.inc_sent();
ret += 1;
}
}
ret
}
// With enough iterations the number of sends must equal the target exactly.
#[test]
fn test_random_routing_distribution() {
let mut rng = dummy_rng();
let mut random_routing = RandomRouting { target: 4, sent: 0, sample_rate: 8 };
assert_eq!(run_random_routing(&mut random_routing, &mut rng, 100, 10000), 4);
let mut random_routing = RandomRouting { target: 8, sent: 0, sample_rate: 100 };
assert_eq!(run_random_routing(&mut random_routing, &mut rng, 100, 10000), 8);
let mut random_routing = RandomRouting { target: 0, sent: 0, sample_rate: 100 };
assert_eq!(run_random_routing(&mut random_routing, &mut rng, 100, 10000), 0);
let mut random_routing = RandomRouting { target: 10, sent: 0, sample_rate: 10 };
assert_eq!(run_random_routing(&mut random_routing, &mut rng, 10, 100), 10);
}
// Each case is (our index, matrix length, expected row neighbors, expected column
// neighbors).
#[test]
fn test_matrix_neighbors() {
for (our_index, len, expected_row, expected_column) in vec![
(0usize, 1usize, vec![], vec![]),
(1, 2, vec![], vec![0usize]),
(0, 9, vec![1, 2], vec![3, 6]),
(9, 10, vec![], vec![0, 3, 6]),
(10, 11, vec![9], vec![1, 4, 7]),
(7, 11, vec![6, 8], vec![1, 4, 10]),
]
.into_iter()
{
let matrix = matrix_neighbors(our_index, len).unwrap();
let mut row_result: Vec<_> = matrix.row_neighbors.collect();
let mut column_result: Vec<_> = matrix.column_neighbors.collect();
row_result.sort();
column_result.sort();
assert_eq!(row_result, expected_row);
assert_eq!(column_result, expected_column);
}
}
}
+773
View File
@@ -0,0 +1,773 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Network protocol types for teyrchains.
#![deny(unused_crate_dependencies)]
#![warn(missing_docs)]
use codec::{Decode, Encode};
use pezkuwi_primitives::{BlockNumber, Hash};
use std::fmt;
#[doc(hidden)]
pub use sc_network::IfDisconnected;
pub use sc_network_types::PeerId;
#[doc(hidden)]
pub use std::sync::Arc;
mod reputation;
pub use self::reputation::{ReputationChange, UnifiedReputationChange};
/// Peer-sets and protocols used for teyrchains.
pub mod peer_set;
/// Request/response protocols used in Pezkuwi.
pub mod request_response;
/// Accessing authority discovery service
pub mod authority_discovery;
/// Grid topology support module
pub mod grid_topology;
/// The minimum amount of peers to send gossip messages to.
pub const MIN_GOSSIP_PEERS: usize = 25;
/// An error indicating that the over-arching message type had the wrong variant.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct WrongVariant;

impl fmt::Display for WrongVariant {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_str("Wrong message variant")
    }
}

impl std::error::Error for WrongVariant {}
/// The advertised role of a node.
///
/// Mirrors `sc_network::ObservedRole`; conversions in both directions are provided.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ObservedRole {
    /// A light node.
    Light,
    /// A full node.
    Full,
    /// A node claiming to be an authority (unauthenticated)
    Authority,
}

impl From<sc_network::ObservedRole> for ObservedRole {
    /// Map each `sc_network` role to the variant of the same name.
    fn from(role: sc_network::ObservedRole) -> ObservedRole {
        match role {
            sc_network::ObservedRole::Light => ObservedRole::Light,
            sc_network::ObservedRole::Authority => ObservedRole::Authority,
            sc_network::ObservedRole::Full => ObservedRole::Full,
        }
    }
}

// Implemented as `From` rather than `Into`: the standard library's blanket impl still
// provides `Into<sc_network::ObservedRole> for ObservedRole`, so existing `.into()` call
// sites keep working.
impl From<ObservedRole> for sc_network::ObservedRole {
    /// Map each role to the `sc_network` variant of the same name.
    fn from(role: ObservedRole) -> sc_network::ObservedRole {
        match role {
            ObservedRole::Light => sc_network::ObservedRole::Light,
            ObservedRole::Full => sc_network::ObservedRole::Full,
            ObservedRole::Authority => sc_network::ObservedRole::Authority,
        }
    }
}
/// Specialized wrapper around [`View`].
///
/// Dereferences to the wrapped [`View`], so all read-only `View` methods are available.
#[derive(Debug, Clone, Default)]
pub struct OurView {
    view: View,
}

impl OurView {
    /// Build a view from chain heads and the finalized block number.
    pub fn new(heads: impl IntoIterator<Item = Hash>, finalized_number: BlockNumber) -> Self {
        OurView { view: View::new(heads, finalized_number) }
    }
}

impl PartialEq for OurView {
    /// Two `OurView`s are equal when their wrapped views are equal.
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (&self.view, &other.view);
        lhs == rhs
    }
}

impl std::ops::Deref for OurView {
    type Target = View;

    fn deref(&self) -> &Self::Target {
        &self.view
    }
}
/// Construct a new [`OurView`] with the given chain heads, finalized number 0
///
/// Every argument is cloned, so both owned hashes and references to hashes are accepted.
/// A trailing comma is allowed.
///
/// NOTE: Use for tests only.
///
/// # Example
///
/// ```
/// # use pezkuwi_node_network_protocol::our_view;
/// # use pezkuwi_primitives::Hash;
/// let our_view = our_view![Hash::repeat_byte(1), Hash::repeat_byte(2)];
/// ```
#[macro_export]
macro_rules! our_view {
    ( $( $hash:expr ),* $(,)? ) => {
        // The `Vec` is already an `IntoIterator`, so it is handed over directly, exactly
        // like the `view!` macro does.
        $crate::OurView::new(vec![ $( $hash.clone() ),* ], 0)
    };
}
/// A succinct representation of a peer's view. This consists of a bounded amount of chain heads
/// and the highest known finalized block number.
///
/// Up to `N` (5?) chain heads.
// NOTE(review): no bound on the number of heads is enforced by the code visible here;
// confirm where (and whether) the limit is applied.
#[derive(Default, Debug, Clone, PartialEq, Eq, Encode, Decode)]
pub struct View {
/// A bounded amount of chain heads.
/// Invariant: Sorted.
// `View::new` establishes the ordering by sorting its input.
heads: Vec<Hash>,
/// The highest known finalized block number.
pub finalized_number: BlockNumber,
}
/// Construct a new view with the given chain heads and finalized number 0.
///
/// Every argument is cloned before being collected, and a trailing comma is allowed.
///
/// NOTE: Use for tests only.
///
/// # Example
///
/// ```
/// # use pezkuwi_node_network_protocol::view;
/// # use pezkuwi_primitives::Hash;
/// let view = view![Hash::repeat_byte(1), Hash::repeat_byte(2)];
/// ```
#[macro_export]
macro_rules! view {
( $( $hash:expr ),* $(,)? ) => {
$crate::View::new(vec![ $( $hash.clone() ),* ], 0)
};
}
impl View {
/// Construct a new view based on heads and a finalized block number.
pub fn new(heads: impl IntoIterator<Item = Hash>, finalized_number: BlockNumber) -> Self {
let mut heads = heads.into_iter().collect::<Vec<Hash>>();
heads.sort();
Self { heads, finalized_number }
}
/// Start with no heads, but only a finalized block number.
pub fn with_finalized(finalized_number: BlockNumber) -> Self {
Self { heads: Vec::new(), finalized_number }
}
/// Obtain the number of heads that are in view.
pub fn len(&self) -> usize {
self.heads.len()
}
/// Check if the number of heads contained, is null.
pub fn is_empty(&self) -> bool {
self.heads.is_empty()
}
/// Obtain an iterator over all heads.
pub fn iter(&self) -> impl Iterator<Item = &Hash> {
self.heads.iter()
}
/// Obtain an iterator over all heads.
pub fn into_iter(self) -> impl Iterator<Item = Hash> {
self.heads.into_iter()
}
/// Replace `self` with `new`.
///
/// Returns an iterator that will yield all elements of `new` that were not part of `self`.
pub fn replace_difference(&mut self, new: View) -> impl Iterator<Item = &Hash> {
let old = std::mem::replace(self, new);
self.heads.iter().filter(move |h| !old.contains(h))
}
/// Returns an iterator of the hashes present in `Self` but not in `other`.
pub fn difference<'a>(&'a self, other: &'a View) -> impl Iterator<Item = &'a Hash> + 'a {
self.heads.iter().filter(move |h| !other.contains(h))
}
/// An iterator containing hashes present in both `Self` and in `other`.
pub fn intersection<'a>(&'a self, other: &'a View) -> impl Iterator<Item = &'a Hash> + 'a {
self.heads.iter().filter(move |h| other.contains(h))
}
/// Whether the view contains a given hash.
pub fn contains(&self, hash: &Hash) -> bool {
self.heads.contains(hash)
}
/// Check if two views have the same heads.
///
/// Equivalent to the `PartialEq` function,
/// but ignores the `finalized_number` field.
pub fn check_heads_eq(&self, other: &Self) -> bool {
self.heads == other.heads
}
}
/// A protocol-versioned type for validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationProtocols<V3> {
    /// V3 type.
    V3(V3),
}

/// A protocol-versioned type for collation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CollationProtocols<V1, V2> {
    /// V1 type.
    V1(V1),
    /// V2 type.
    V2(V2),
}

impl<V3: Clone> ValidationProtocols<&'_ V3> {
    /// Clone the borrowed payload, producing a fully-owned version of the message.
    pub fn clone_inner(&self) -> ValidationProtocols<V3> {
        match self {
            ValidationProtocols::V3(borrowed) => ValidationProtocols::V3(V3::clone(borrowed)),
        }
    }
}

impl<V1: Clone, V2: Clone> CollationProtocols<&'_ V1, &'_ V2> {
    /// Clone the borrowed payload, producing a fully-owned version of the message.
    ///
    /// The protocol version of the result is the same as that of `self`.
    pub fn clone_inner(&self) -> CollationProtocols<V1, V2> {
        match self {
            CollationProtocols::V1(borrowed) => CollationProtocols::V1(V1::clone(borrowed)),
            CollationProtocols::V2(borrowed) => CollationProtocols::V2(V2::clone(borrowed)),
        }
    }
}
/// All supported versions of the validation protocol message.
pub type VersionedValidationProtocol = ValidationProtocols<v3::ValidationProtocol>;

impl From<v3::ValidationProtocol> for VersionedValidationProtocol {
    /// Wrap a v3 validation protocol message in its versioned envelope.
    fn from(message: v3::ValidationProtocol) -> Self {
        Self::V3(message)
    }
}
/// All supported versions of the collation protocol message.
pub type VersionedCollationProtocol =
    CollationProtocols<v1::CollationProtocol, v2::CollationProtocol>;

impl From<v1::CollationProtocol> for VersionedCollationProtocol {
    /// Wrap a v1 collation protocol message in its versioned envelope.
    fn from(message: v1::CollationProtocol) -> Self {
        Self::V1(message)
    }
}

impl From<v2::CollationProtocol> for VersionedCollationProtocol {
    /// Wrap a v2 collation protocol message in its versioned envelope.
    fn from(message: v2::CollationProtocol) -> Self {
        Self::V2(message)
    }
}
// Generates `impl From<$from> for $out` for versioned validation messages: the payload of
// the `V3` variant is converted with `.into()` and wrapped in `V3` again.
//
// `$variant` is part of the macro's signature but is not referenced in the expansion.
macro_rules! impl_versioned_validation_full_protocol_from {
($from:ty, $out:ty, $variant:ident) => {
impl From<$from> for $out {
fn from(versioned_from: $from) -> $out {
match versioned_from {
ValidationProtocols::V3(x) => ValidationProtocols::V3(x.into()),
}
}
}
};
}
// Generates `impl From<$from> for $out` for versioned collation messages: the payload of
// each version variant is converted with `.into()` and wrapped in the same variant again.
//
// `$variant` is part of the macro's signature but is not referenced in the expansion.
macro_rules! impl_versioned_collation_full_protocol_from {
($from:ty, $out:ty, $variant:ident) => {
impl From<$from> for $out {
fn from(versioned_from: $from) -> $out {
match versioned_from {
CollationProtocols::V1(x) => CollationProtocols::V1(x.into()),
CollationProtocols::V2(x) => CollationProtocols::V2(x.into()),
}
}
}
};
}
/// Implement `TryFrom<$from>` and `TryFrom<&$from>` for `$out`, extracting one kind of
/// message from a versioned validation enum.
///
/// `$v3_pat => $v3_out` gives the pattern matched against the payload of the `V3` variant
/// and the expression, built from the pattern's bindings, that is wrapped in
/// `ValidationProtocols::V3` on success. A payload that does not match the pattern yields
/// `Err(WrongVariant)`. The by-reference impl clones the output expression.
macro_rules! impl_versioned_validation_try_from {
(
$from:ty,
$out:ty,
$v3_pat:pat => $v3_out:expr
) => {
impl TryFrom<$from> for $out {
type Error = crate::WrongVariant;
fn try_from(x: $from) -> Result<$out, Self::Error> {
#[allow(unreachable_patterns)] // when there is only one variant
match x {
ValidationProtocols::V3($v3_pat) => Ok(ValidationProtocols::V3($v3_out)),
_ => Err(crate::WrongVariant),
}
}
}
impl<'a> TryFrom<&'a $from> for $out {
type Error = crate::WrongVariant;
fn try_from(x: &'a $from) -> Result<$out, Self::Error> {
#[allow(unreachable_patterns)] // when there is only one variant
match x {
ValidationProtocols::V3($v3_pat) =>
Ok(ValidationProtocols::V3($v3_out.clone())),
_ => Err(crate::WrongVariant),
}
}
}
};
}
/// Implement `TryFrom<$from>` and `TryFrom<&$from>` for `$out`, extracting one kind of
/// message from a versioned collation enum.
///
/// `$v1_pat => $v1_out` and `$v2_pat => $v2_out` give, per protocol version, the pattern
/// matched against the variant's payload and the expression, built from the pattern's
/// bindings, that is wrapped in the same version variant on success. A payload that matches
/// neither pattern yields `Err(WrongVariant)`. The by-reference impl clones the output
/// expression.
macro_rules! impl_versioned_collation_try_from {
(
$from:ty,
$out:ty,
$v1_pat:pat => $v1_out:expr,
$v2_pat:pat => $v2_out:expr
) => {
impl TryFrom<$from> for $out {
type Error = crate::WrongVariant;
fn try_from(x: $from) -> Result<$out, Self::Error> {
#[allow(unreachable_patterns)] // when there is only one variant
match x {
CollationProtocols::V1($v1_pat) => Ok(CollationProtocols::V1($v1_out)),
CollationProtocols::V2($v2_pat) => Ok(CollationProtocols::V2($v2_out)),
_ => Err(crate::WrongVariant),
}
}
}
impl<'a> TryFrom<&'a $from> for $out {
type Error = crate::WrongVariant;
fn try_from(x: &'a $from) -> Result<$out, Self::Error> {
#[allow(unreachable_patterns)] // when there is only one variant
match x {
CollationProtocols::V1($v1_pat) => Ok(CollationProtocols::V1($v1_out.clone())),
CollationProtocols::V2($v2_pat) => Ok(CollationProtocols::V2($v2_out.clone())),
_ => Err(crate::WrongVariant),
}
}
}
};
}
/// Version-annotated messages used by the bitfield distribution subsystem.
pub type BitfieldDistributionMessage = ValidationProtocols<v3::BitfieldDistributionMessage>;
// `From<BitfieldDistributionMessage> for VersionedValidationProtocol`.
impl_versioned_validation_full_protocol_from!(
BitfieldDistributionMessage,
VersionedValidationProtocol,
BitfieldDistribution
);
// `TryFrom<VersionedValidationProtocol>` (owned and by reference) for
// `BitfieldDistributionMessage`; succeeds only for a `BitfieldDistribution` payload.
impl_versioned_validation_try_from!(
VersionedValidationProtocol,
BitfieldDistributionMessage,
v3::ValidationProtocol::BitfieldDistribution(x) => x
);
/// Version-annotated messages used by the statement distribution subsystem.
pub type StatementDistributionMessage = ValidationProtocols<v3::StatementDistributionMessage>;
// `From<StatementDistributionMessage> for VersionedValidationProtocol`.
impl_versioned_validation_full_protocol_from!(
StatementDistributionMessage,
VersionedValidationProtocol,
StatementDistribution
);
// `TryFrom<VersionedValidationProtocol>` (owned and by reference) for
// `StatementDistributionMessage`; succeeds only for a `StatementDistribution` payload.
impl_versioned_validation_try_from!(
VersionedValidationProtocol,
StatementDistributionMessage,
v3::ValidationProtocol::StatementDistribution(x) => x
);
/// Version-annotated messages used by the approval distribution subsystem.
pub type ApprovalDistributionMessage = ValidationProtocols<v3::ApprovalDistributionMessage>;
// `From<ApprovalDistributionMessage> for VersionedValidationProtocol`.
impl_versioned_validation_full_protocol_from!(
ApprovalDistributionMessage,
VersionedValidationProtocol,
ApprovalDistribution
);
// `TryFrom<VersionedValidationProtocol>` (owned and by reference) for
// `ApprovalDistributionMessage`; succeeds only for an `ApprovalDistribution` payload.
impl_versioned_validation_try_from!(
VersionedValidationProtocol,
ApprovalDistributionMessage,
v3::ValidationProtocol::ApprovalDistribution(x) => x
);
/// Version-annotated messages used by the gossip-support subsystem (this is void).
pub type GossipSupportNetworkMessage = ValidationProtocols<v3::GossipSupportNetworkMessage>;

// The gossip-support message is a void enum placeholder that never gets sent over the
// wire, so no validation protocol message can ever be converted into it.
impl TryFrom<VersionedValidationProtocol> for GossipSupportNetworkMessage {
    type Error = WrongVariant;

    /// Always fails with `WrongVariant`.
    fn try_from(_message: VersionedValidationProtocol) -> Result<Self, WrongVariant> {
        Err(WrongVariant)
    }
}

impl<'a> TryFrom<&'a VersionedValidationProtocol> for GossipSupportNetworkMessage {
    type Error = WrongVariant;

    /// Always fails with `WrongVariant`.
    fn try_from(_message: &'a VersionedValidationProtocol) -> Result<Self, WrongVariant> {
        Err(WrongVariant)
    }
}
/// Version-annotated messages used by the collator protocol subsystem.
///
/// Wraps both supported collator protocol message versions (v1 and v2) in
/// `CollationProtocols`.
pub type CollatorProtocolMessage =
CollationProtocols<v1::CollatorProtocolMessage, v2::CollatorProtocolMessage>;
// NOTE(review): presumably generates the conversion of `CollatorProtocolMessage` into
// `VersionedCollationProtocol` via the `CollatorProtocol` variant. The macro is defined
// outside this section - confirm there.
impl_versioned_collation_full_protocol_from!(
CollatorProtocolMessage,
VersionedCollationProtocol,
CollatorProtocol
);
// NOTE(review): presumably generates the fallible reverse conversion, extracting the
// `CollatorProtocol` payload for each of the v1 and v2 wire protocols - confirm against the
// macro definition.
impl_versioned_collation_try_from!(
VersionedCollationProtocol,
CollatorProtocolMessage,
v1::CollationProtocol::CollatorProtocol(x) => x,
v2::CollationProtocol::CollatorProtocol(x) => x
);
/// v1 notification protocol types.
pub mod v1 {
    use codec::{Decode, Encode};

    use pezkuwi_primitives::{CollatorId, CollatorSignature, Hash, Id as ParaId};

    use pezkuwi_node_primitives::UncheckedSignedFullStatement;

    /// Network messages used by the collator protocol subsystem.
    ///
    /// The codec indices are fixed explicitly; indices 2 and 3 are not used by this enum.
    #[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
    pub enum CollatorProtocolMessage {
        /// Declare the intent to advertise collations under a collator ID, attaching a
        /// signature of the `PeerId` of the node using the given collator ID key.
        #[codec(index = 0)]
        Declare(CollatorId, ParaId, CollatorSignature),
        /// Advertise a collation to a validator. Can only be sent once the peer has
        /// declared that they are a collator with given ID.
        #[codec(index = 1)]
        AdvertiseCollation(Hash),
        /// A collation sent to a validator was seconded.
        #[codec(index = 4)]
        CollationSeconded(Hash, UncheckedSignedFullStatement),
    }

    /// All network messages on the collation peer-set.
    #[derive(Debug, Clone, Encode, Decode, PartialEq, Eq, derive_more::From)]
    pub enum CollationProtocol {
        /// Collator protocol messages.
        #[codec(index = 0)]
        #[from]
        CollatorProtocol(CollatorProtocolMessage),
    }

    /// Build the payload that has to be signed and included in a `Declare` message.
    ///
    /// The payload consists of the byte representation of the node's local peer id followed
    /// by the ASCII tag `COLL`. Signing it serves to prove that the node controls the
    /// collator key it is declaring an intention to collate under.
    pub fn declare_signature_payload(peer_id: &sc_network_types::PeerId) -> Vec<u8> {
        peer_id.to_bytes().into_iter().chain(b"COLL".iter().copied()).collect()
    }
}
/// v2 network protocol types.
pub mod v2 {
use codec::{Decode, Encode};
use pezkuwi_primitives::{CandidateHash, CollatorId, CollatorSignature, Hash, Id as ParaId};
use pezkuwi_node_primitives::UncheckedSignedFullStatement;
/// These parts of the protocol did not change from v1, so they are just re-exported in v2.
pub use super::v1::declare_signature_payload;
/// Network messages used by the collator protocol subsystem.
///
/// The codec indices are fixed explicitly; indices 2 and 3 are not used by this enum.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum CollatorProtocolMessage {
/// Declare the intent to advertise collations under a collator ID, attaching a
/// signature of the `PeerId` of the node using the given collator ID key.
#[codec(index = 0)]
Declare(CollatorId, ParaId, CollatorSignature),
/// Advertise a collation to a validator. Can only be sent once the peer has
/// declared that they are a collator with given ID.
#[codec(index = 1)]
AdvertiseCollation {
/// Hash of the relay parent the advertised collation is based on.
relay_parent: Hash,
/// Candidate hash.
candidate_hash: CandidateHash,
/// Teyrchain head data hash before candidate execution.
parent_head_data_hash: Hash,
},
/// A collation sent to a validator was seconded.
#[codec(index = 4)]
CollationSeconded(Hash, UncheckedSignedFullStatement),
}
/// All network messages on the collation peer-set.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq, derive_more::From)]
pub enum CollationProtocol {
/// Collator protocol messages.
#[codec(index = 0)]
#[from]
CollatorProtocol(CollatorProtocolMessage),
}
}
/// v3 network protocol types.
/// Purpose is for changing ApprovalDistributionMessage to
/// include more than one assignment and approval in a message.
pub mod v3 {
use bitvec::{order::Lsb0, slice::BitSlice, vec::BitVec};
use codec::{Decode, Encode};
use pezkuwi_primitives::{
CandidateHash, GroupIndex, Hash, Id as ParaId, UncheckedSignedAvailabilityBitfield,
UncheckedSignedStatement,
};
use pezkuwi_node_primitives::approval::v2::{
CandidateBitfield, IndirectAssignmentCertV2, IndirectSignedApprovalVoteV2,
};
/// These parts of the protocol did not change from v2, so they are just re-exported in v3.
pub use super::v2::declare_signature_payload;
/// Network messages used by the bitfield distribution subsystem.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum BitfieldDistributionMessage {
/// A signed availability bitfield for a given relay-parent hash, carried as an
/// `UncheckedSignedAvailabilityBitfield`.
#[codec(index = 0)]
Bitfield(Hash, UncheckedSignedAvailabilityBitfield),
}
/// Bitfields indicating the statements that are known or undesired
/// about a candidate.
///
/// Both bitfields are stored as `u8`-backed bit vectors in `Lsb0` bit order.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub struct StatementFilter {
/// Seconded statements. '1' is known or undesired.
pub seconded_in_group: BitVec<u8, Lsb0>,
/// Valid statements. '1' is known or undesired.
pub validated_in_group: BitVec<u8, Lsb0>,
}
impl StatementFilter {
    /// Create a new blank filter (all bits unset) with the given group size.
    pub fn blank(group_size: usize) -> Self {
        Self::uniform(false, group_size)
    }

    /// Create a new full filter (all bits set) with the given group size.
    pub fn full(group_size: usize) -> Self {
        Self::uniform(true, group_size)
    }

    /// Build a filter where both bitfields hold `group_size` copies of `bit`.
    fn uniform(bit: bool, group_size: usize) -> Self {
        Self {
            seconded_in_group: BitVec::repeat(bit, group_size),
            validated_in_group: BitVec::repeat(bit, group_size),
        }
    }

    /// Whether both bitfields of the filter have exactly the expected length `len`.
    pub fn has_len(&self, len: usize) -> bool {
        let seconded_matches = self.seconded_in_group.len() == len;
        let validated_matches = self.validated_in_group.len() == len;
        seconded_matches && validated_matches
    }

    /// Determine the number of backing validators in the statement filter.
    ///
    /// A position counts once if it is set in either bitfield, so a validator appearing in
    /// both is not double-counted. Only positions present in both bitfields are inspected.
    pub fn backing_validators(&self) -> usize {
        let seconded = self.seconded_in_group.iter().by_vals();
        let validated = self.validated_in_group.iter().by_vals();
        seconded.zip(validated).map(|(s, v)| usize::from(s || v)).sum()
    }

    /// Whether the statement filter has at least one seconded statement.
    pub fn has_seconded(&self) -> bool {
        self.seconded_in_group.any()
    }

    /// Mask out `Seconded` statements in `self` according to the provided
    /// bitvec. Bits appearing in `mask` will not appear in `self` afterwards.
    pub fn mask_seconded(&mut self, mask: &BitSlice<u8, Lsb0>) {
        Self::clear_masked(&mut self.seconded_in_group, mask);
    }

    /// Mask out `Valid` statements in `self` according to the provided
    /// bitvec. Bits appearing in `mask` will not appear in `self` afterwards.
    pub fn mask_valid(&mut self, mask: &BitSlice<u8, Lsb0>) {
        Self::clear_masked(&mut self.validated_in_group, mask);
    }

    /// Unset every bit of `bits` whose counterpart in `mask` is set.
    ///
    /// A `mask` shorter than `bits` is treated as padded with unset bits, so the trailing
    /// bits of `bits` keep their value; surplus mask bits are ignored.
    fn clear_masked(bits: &mut BitSlice<u8, Lsb0>, mask: &BitSlice<u8, Lsb0>) {
        let padded_mask = mask.iter().by_vals().chain(std::iter::repeat(false));
        for (mut bit, masked) in bits.iter_mut().zip(padded_mask) {
            // (bit, masked) => bit
            // (true, true) => false
            // (true, false) => true
            // (false, _) => false
            *bit &= !masked;
        }
    }
}
/// A manifest of a known backed candidate, along with a description
/// of the statements backing it.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub struct BackedCandidateManifest {
/// The relay-parent of the candidate.
pub relay_parent: Hash,
/// The hash of the candidate.
pub candidate_hash: CandidateHash,
/// The group index backing the candidate at the relay-parent.
pub group_index: GroupIndex,
/// The para ID of the candidate. It is illegal for this to
/// be a para ID which is not assigned to the group indicated
/// in this manifest.
pub para_id: ParaId,
/// The head-data corresponding to the candidate.
///
/// NOTE(review): going by the field name and its `Hash` type this looks like the hash of
/// the parent head-data rather than the head-data itself - confirm and reword.
pub parent_head_data_hash: Hash,
/// A statement filter which indicates which validators in the
/// para's group at the relay-parent have validated this candidate
/// and issued statements about it, to the advertiser's knowledge.
///
/// This MUST have exactly the minimum amount of bytes
/// necessary to represent the number of validators in the assigned
/// backing group as-of the relay-parent.
pub statement_knowledge: StatementFilter,
}
/// An acknowledgement of a backed candidate being known.
///
/// It names the candidate by hash only and carries the sender's statement knowledge.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub struct BackedCandidateAcknowledgement {
/// The hash of the candidate.
pub candidate_hash: CandidateHash,
/// A statement filter which indicates which validators in the
/// para's group at the relay-parent have validated this candidate
/// and issued statements about it, to the advertiser's knowledge.
///
/// This MUST have exactly the minimum amount of bytes
/// necessary to represent the number of validators in the assigned
/// backing group as-of the relay-parent.
pub statement_knowledge: StatementFilter,
}
/// Network messages used by the statement distribution subsystem.
///
/// The codec index of every variant is fixed explicitly.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum StatementDistributionMessage {
/// A notification of a signed statement in compact form, for a given relay parent.
#[codec(index = 0)]
Statement(Hash, UncheckedSignedStatement),
/// A notification of a backed candidate being known by the
/// sending node, for the purpose of being requested by the receiving node
/// if needed.
#[codec(index = 1)]
BackedCandidateManifest(BackedCandidateManifest),
/// A notification of a backed candidate being known by the sending node,
/// for the purpose of informing a receiving node which already has the candidate.
#[codec(index = 2)]
BackedCandidateKnown(BackedCandidateAcknowledgement),
}
/// Network messages used by the approval distribution subsystem.
///
/// Both variants carry a `Vec`, so a single message can hold several assignments or
/// approvals.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum ApprovalDistributionMessage {
/// Assignments for candidates in recent, unfinalized blocks.
/// We use a bitfield to reference claimed candidates, where the bit index is equal to
/// candidate index.
///
/// Actually checking the assignment may yield a different result.
///
/// TODO at next protocol upgrade opportunity:
/// - remove redundancy `candidate_index` vs `core_index`
/// - `<https://github.com/pezkuwichain/pezkuwi-sdk/issues/106>`
#[codec(index = 0)]
Assignments(Vec<(IndirectAssignmentCertV2, CandidateBitfield)>),
/// Approvals for candidates in some recent, unfinalized block.
#[codec(index = 1)]
Approvals(Vec<IndirectSignedApprovalVoteV2>),
}
/// Dummy network message type, so we will receive connect/disconnect events.
///
/// The enum has no variants, so no value of this type can ever be constructed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GossipSupportNetworkMessage {}
/// All network messages on the validation peer-set.
///
/// The codec indices are fixed explicitly; indices 0 and 2 are not used by this enum.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq, derive_more::From)]
pub enum ValidationProtocol {
/// Bitfield distribution messages.
#[codec(index = 1)]
#[from]
BitfieldDistribution(BitfieldDistributionMessage),
/// Statement distribution messages.
#[codec(index = 3)]
#[from]
StatementDistribution(StatementDistributionMessage),
/// Approval distribution messages.
#[codec(index = 4)]
#[from]
ApprovalDistribution(ApprovalDistributionMessage),
}
}
/// Returns the ids of all `peers` whose protocol version equals `version`.
///
/// The relative order of the input is preserved; peers on any other version are dropped.
pub fn filter_by_peer_version(
    peers: &[(PeerId, peer_set::ProtocolVersion)],
    version: peer_set::ProtocolVersion,
) -> Vec<PeerId> {
    peers
        .iter()
        .filter_map(|&(peer, peer_version)| (peer_version == version).then_some(peer))
        .collect()
}
@@ -0,0 +1,616 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! All peersets and protocols used for teyrchains.
use derive_more::Display;
use pezkuwi_primitives::Hash;
use sc_network::{
config::SetConfig, peer_store::PeerStoreProvider, service::NotificationMetrics,
types::ProtocolName, NetworkBackend, NotificationService,
};
use sp_runtime::traits::Block;
use std::{
collections::{hash_map::Entry, HashMap},
ops::{Index, IndexMut},
sync::Arc,
};
use strum::{EnumIter, IntoEnumIterator};
/// The legacy collation protocol name. Only supported on version = 1.
///
/// Unlike the generated protocol names, this one is a fixed string containing neither a
/// genesis hash nor a fork id.
const LEGACY_COLLATION_PROTOCOL_V1: &str = "/pezkuwi/collation/1";
/// The legacy protocol version. Is always 1 for collation.
const LEGACY_COLLATION_PROTOCOL_VERSION_V1: u32 = 1;
/// Max notification size is currently constant (`100 * 1024`) for every peer set.
pub const MAX_NOTIFICATION_SIZE: u64 = 100 * 1024;
/// Maximum allowed incoming connection streams for validator nodes on the collation protocol.
pub const MAX_AUTHORITY_INCOMING_STREAMS: u32 = 100;
/// The peer-sets and thus the protocols which are used for the network.
///
/// Derives `EnumIter`, so all peer sets can be enumerated with `PeerSet::iter()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter)]
pub enum PeerSet {
/// The validation peer-set is responsible for all messages related to candidate validation and
/// communication among validators.
Validation,
/// The collation peer-set is used for validator<>collator communication.
Collation,
}
/// Whether a node is an authority or not.
///
/// Peer set configuration gets adjusted accordingly: on the collation peer set, only
/// authorities get incoming slots and accept non-reserved peers.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum IsAuthority {
/// Node is authority.
Yes,
/// Node is not an authority.
No,
}
impl PeerSet {
/// Get `sc_network` peer set configurations for each peerset on the default version.
///
/// Those should be used in the network configuration to register the protocols with the
/// network service.
pub fn get_info<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
self,
is_authority: IsAuthority,
peerset_protocol_names: &PeerSetProtocolNames,
metrics: NotificationMetrics,
peer_store_handle: Arc<dyn PeerStoreProvider>,
) -> (N::NotificationProtocolConfig, (PeerSet, Box<dyn NotificationService>)) {
// Networking layer relies on `get_main_name()` being the main name of the protocol
// for peersets and connection management.
let protocol = peerset_protocol_names.get_main_name(self);
let fallback_names = PeerSetProtocolNames::get_fallback_names(
self,
&peerset_protocol_names.genesis_hash,
peerset_protocol_names.fork_id.as_deref(),
);
let max_notification_size = self.get_max_notification_size(is_authority);
match self {
PeerSet::Validation => {
let (config, notification_service) = N::notification_config(
protocol,
fallback_names,
max_notification_size,
None,
SetConfig {
// we allow full nodes to connect to validators for gossip
// to ensure any `MIN_GOSSIP_PEERS` always include reserved peers
// we limit the amount of non-reserved slots to be less
// than `MIN_GOSSIP_PEERS` in total
in_peers: super::MIN_GOSSIP_PEERS as u32 / 2 - 1,
out_peers: super::MIN_GOSSIP_PEERS as u32 / 2 - 1,
reserved_nodes: Vec::new(),
non_reserved_mode: sc_network::config::NonReservedPeerMode::Accept,
},
metrics,
peer_store_handle,
);
(config, (PeerSet::Validation, notification_service))
},
PeerSet::Collation => {
let (config, notification_service) = N::notification_config(
protocol,
fallback_names,
max_notification_size,
None,
SetConfig {
// Non-authority nodes don't need to accept incoming connections on this
// peer set:
in_peers: if is_authority == IsAuthority::Yes {
MAX_AUTHORITY_INCOMING_STREAMS
} else {
0
},
out_peers: 0,
reserved_nodes: Vec::new(),
non_reserved_mode: if is_authority == IsAuthority::Yes {
sc_network::config::NonReservedPeerMode::Accept
} else {
sc_network::config::NonReservedPeerMode::Deny
},
},
metrics,
peer_store_handle,
);
(config, (PeerSet::Collation, notification_service))
},
}
}
/// Get the main protocol version for this peer set.
///
/// Networking layer relies on `get_main_version()` being the version
/// of the main protocol name reported by [`PeerSetProtocolNames::get_main_name()`].
pub fn get_main_version(self) -> ProtocolVersion {
match self {
PeerSet::Validation => ValidationVersion::V3.into(),
PeerSet::Collation => CollationVersion::V2.into(),
}
}
/// Get the max notification size for this peer set.
pub fn get_max_notification_size(self, _: IsAuthority) -> u64 {
MAX_NOTIFICATION_SIZE
}
/// Get the peer set label for metrics reporting.
pub fn get_label(self) -> &'static str {
match self {
PeerSet::Validation => "validation",
PeerSet::Collation => "collation",
}
}
/// Get the protocol label for metrics reporting.
pub fn get_protocol_label(self, version: ProtocolVersion) -> Option<&'static str> {
// Unfortunately, labels must be static strings, so we must manually cover them
// for all protocol versions here.
match self {
PeerSet::Validation =>
if version == ValidationVersion::V3.into() {
Some("validation/3")
} else {
None
},
PeerSet::Collation =>
if version == CollationVersion::V1.into() {
Some("collation/1")
} else if version == CollationVersion::V2.into() {
Some("collation/2")
} else {
None
},
}
}
}
/// A small collection storing exactly one `T` per [`PeerSet`], addressed by indexing with
/// the peer set in question.
#[derive(Debug, Default)]
pub struct PerPeerSet<T> {
    validation: T,
    collation: T,
}

impl<T> Index<PeerSet> for PerPeerSet<T> {
    type Output = T;

    /// Borrow the entry stored for the given peer set.
    fn index(&self, index: PeerSet) -> &T {
        let Self { validation, collation } = self;
        match index {
            PeerSet::Validation => validation,
            PeerSet::Collation => collation,
        }
    }
}

impl<T> IndexMut<PeerSet> for PerPeerSet<T> {
    /// Mutably borrow the entry stored for the given peer set.
    fn index_mut(&mut self, index: PeerSet) -> &mut T {
        let Self { validation, collation } = self;
        match index {
            PeerSet::Validation => validation,
            PeerSet::Collation => collation,
        }
    }
}
/// Get `NonDefaultSetConfig`s for all available peer sets, at their default versions.
///
/// Should be used during network configuration (added to `NetworkConfiguration::extra_sets`)
/// or shortly after startup to register the protocols with the network service.
///
/// One entry is produced per [`PeerSet`] variant, in the iteration order of `PeerSet::iter()`.
/// Every entry receives its own clone of `metrics` and of the `peer_store_handle`.
pub fn peer_sets_info<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
    is_authority: IsAuthority,
    peerset_protocol_names: &PeerSetProtocolNames,
    metrics: NotificationMetrics,
    peer_store_handle: Arc<dyn PeerStoreProvider>,
) -> Vec<(N::NotificationProtocolConfig, (PeerSet, Box<dyn NotificationService>))> {
    PeerSet::iter()
        .map(|peer_set| {
            peer_set.get_info::<B, N>(
                is_authority,
                // Already a reference: pass it on as-is instead of borrowing it again.
                peerset_protocol_names,
                metrics.clone(),
                Arc::clone(&peer_store_handle),
            )
        })
        .collect()
}
/// A generic version of the protocol. This struct must not be created directly.
///
/// The wrapped number is private to this module.
#[derive(Debug, Clone, Copy, Display, PartialEq, Eq, Hash)]
pub struct ProtocolVersion(u32);

impl From<ProtocolVersion> for u32 {
    /// Unwrap the raw version number.
    fn from(version: ProtocolVersion) -> u32 {
        let ProtocolVersion(raw) = version;
        raw
    }
}
/// Supported validation protocol versions. Only versions defined here must be used in the codebase.
///
/// The discriminant of each variant is the numeric protocol version.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter)]
pub enum ValidationVersion {
/// The third version.
V3 = 3,
}
/// Supported collation protocol versions. Only versions defined here must be used in the codebase.
///
/// The discriminant of each variant is the numeric protocol version.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter)]
pub enum CollationVersion {
/// The first version.
V1 = 1,
/// The second version.
V2 = 2,
}
/// Marker indicating the version is unknown.
///
/// Used as the error type when converting a `ProtocolVersion` into one of the version enums.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct UnknownVersion;
impl TryFrom<ProtocolVersion> for ValidationVersion {
    type Error = UnknownVersion;

    /// Map a generic protocol version onto the validation version with the same number.
    ///
    /// Fails with [`UnknownVersion`] when no variant carries that number.
    fn try_from(p: ProtocolVersion) -> Result<Self, UnknownVersion> {
        Self::iter().find(|candidate| *candidate as u32 == p.0).ok_or(UnknownVersion)
    }
}
impl TryFrom<ProtocolVersion> for CollationVersion {
    type Error = UnknownVersion;

    /// Map a generic protocol version onto the collation version with the same number.
    ///
    /// Fails with [`UnknownVersion`] when no variant carries that number.
    fn try_from(p: ProtocolVersion) -> Result<Self, UnknownVersion> {
        Self::iter().find(|candidate| *candidate as u32 == p.0).ok_or(UnknownVersion)
    }
}
impl From<ValidationVersion> for ProtocolVersion {
fn from(version: ValidationVersion) -> ProtocolVersion {
ProtocolVersion(version as u32)
}
}
impl From<CollationVersion> for ProtocolVersion {
fn from(version: CollationVersion) -> ProtocolVersion {
ProtocolVersion(version as u32)
}
}
/// On the wire protocol name to [`PeerSet`] mapping.
///
/// Keeps the mapping in both directions, together with the genesis hash and fork id the
/// names were generated from.
#[derive(Debug, Clone)]
pub struct PeerSetProtocolNames {
// On-the-wire name -> peer set and version. Also holds the legacy collation name.
protocols: HashMap<ProtocolName, (PeerSet, ProtocolVersion)>,
// Peer set and version -> generated on-the-wire name.
names: HashMap<(PeerSet, ProtocolVersion), ProtocolName>,
// Genesis hash the generated names are prefixed with.
genesis_hash: Hash,
// Optional fork id that is part of the generated names.
fork_id: Option<String>,
}
impl PeerSetProtocolNames {
    /// Construct [`PeerSetProtocolNames`] using `genesis_hash` and `fork_id`.
    ///
    /// Every version of every peer set is registered under its generated name; the
    /// collation peer set is additionally reachable under the legacy v1 name.
    ///
    /// # Panics
    ///
    /// Panics if two protocols end up with the same on-the-wire name.
    pub fn new(genesis_hash: Hash, fork_id: Option<&str>) -> Self {
        let mut protocols = HashMap::new();
        let mut names = HashMap::new();

        for peer_set in PeerSet::iter() {
            let versions: Vec<ProtocolVersion> = match peer_set {
                PeerSet::Validation =>
                    ValidationVersion::iter().map(ProtocolVersion::from).collect(),
                PeerSet::Collation =>
                    CollationVersion::iter().map(ProtocolVersion::from).collect(),
            };
            for version in versions {
                Self::register_main_protocol(
                    &mut protocols,
                    &mut names,
                    peer_set,
                    version,
                    &genesis_hash,
                    fork_id,
                );
            }
            // The legacy name is registered after all generated names of the peer set.
            match peer_set {
                PeerSet::Validation => {},
                PeerSet::Collation =>
                    Self::register_legacy_collation_protocol(&mut protocols, peer_set),
            }
        }

        Self { protocols, names, genesis_hash, fork_id: fork_id.map(String::from) }
    }

    /// Helper function to register main protocol: records the generated name in both
    /// lookup directions.
    fn register_main_protocol(
        protocols: &mut HashMap<ProtocolName, (PeerSet, ProtocolVersion)>,
        names: &mut HashMap<(PeerSet, ProtocolVersion), ProtocolName>,
        protocol: PeerSet,
        version: ProtocolVersion,
        genesis_hash: &Hash,
        fork_id: Option<&str>,
    ) {
        let wire_name = Self::generate_name(genesis_hash, fork_id, protocol, version);
        names.insert((protocol, version), wire_name.clone());
        Self::insert_protocol_or_panic(protocols, wire_name, protocol, version);
    }

    /// Helper function to register legacy collation protocol. Only the name-to-protocol
    /// direction is recorded.
    fn register_legacy_collation_protocol(
        protocols: &mut HashMap<ProtocolName, (PeerSet, ProtocolVersion)>,
        protocol: PeerSet,
    ) {
        let legacy_version = ProtocolVersion(LEGACY_COLLATION_PROTOCOL_VERSION_V1);
        Self::insert_protocol_or_panic(
            protocols,
            LEGACY_COLLATION_PROTOCOL_V1.into(),
            protocol,
            legacy_version,
        )
    }

    /// Helper function to make sure no protocols have the same name.
    ///
    /// # Panics
    ///
    /// Panics if `name` is already registered.
    fn insert_protocol_or_panic(
        protocols: &mut HashMap<ProtocolName, (PeerSet, ProtocolVersion)>,
        name: ProtocolName,
        protocol: PeerSet,
        version: ProtocolVersion,
    ) {
        let clash = match protocols.entry(name) {
            Entry::Vacant(slot) => {
                slot.insert((protocol, version));
                return;
            },
            Entry::Occupied(existing) => existing,
        };
        let (other_set, other_version) = *clash.get();
        panic!(
            "Protocol {:?} (version {}) has the same on-the-wire name as protocol {:?} (version {}): `{}`.",
            protocol,
            version,
            other_set,
            other_version,
            clash.key(),
        );
    }

    /// Lookup the protocol using its on the wire name.
    pub fn try_get_protocol(&self, name: &ProtocolName) -> Option<(PeerSet, ProtocolVersion)> {
        self.protocols.get(name).copied()
    }

    /// Get the main protocol name. It's used by the networking for keeping track
    /// of peersets and connections.
    pub fn get_main_name(&self, protocol: PeerSet) -> ProtocolName {
        let main_version = protocol.get_main_version();
        self.get_name(protocol, main_version)
    }

    /// Get the protocol name for specific version.
    ///
    /// # Panics
    ///
    /// Panics if the `(protocol, version)` pair was not registered in `new()`.
    pub fn get_name(&self, protocol: PeerSet, version: ProtocolVersion) -> ProtocolName {
        let registered = self.names.get(&(protocol, version));
        registered
            .expect("Protocols & versions are specified via enums defined above, and they are all registered in `new()`; qed")
            .clone()
    }

    /// The protocol name of this protocol based on `genesis_hash` and `fork_id`.
    ///
    /// The shape is `/<genesis hash as hex>[/<fork id>]/<peer set>/<version>`.
    fn generate_name(
        genesis_hash: &Hash,
        fork_id: Option<&str>,
        protocol: PeerSet,
        version: ProtocolVersion,
    ) -> ProtocolName {
        let genesis_hex = hex::encode(genesis_hash);
        let prefix = match fork_id {
            Some(fork_id) => format!("/{}/{}", genesis_hex, fork_id),
            None => format!("/{}", genesis_hex),
        };
        let short_name = match protocol {
            PeerSet::Validation => "validation",
            PeerSet::Collation => "collation",
        };
        format!("{}/{}/{}", prefix, short_name, version).into()
    }

    /// Get the protocol fallback names. Currently, it only holds
    /// the legacy name for the collation protocol version 1.
    fn get_fallback_names(
        protocol: PeerSet,
        _genesis_hash: &Hash,
        _fork_id: Option<&str>,
    ) -> Vec<ProtocolName> {
        match protocol {
            // The validation protocol no longer supports protocol versions 1 and 2,
            // and only version 3 is used. Therefore, fallback protocols remain empty.
            PeerSet::Validation => Vec::new(),
            PeerSet::Collation => vec![LEGACY_COLLATION_PROTOCOL_V1.into()],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::{
        CollationVersion, Hash, PeerSet, PeerSetProtocolNames, ProtocolVersion, ValidationVersion,
    };
    use strum::IntoEnumIterator;

    /// Arbitrary raw version number, convertible into a [`ProtocolVersion`].
    struct TestVersion(u32);

    impl From<TestVersion> for ProtocolVersion {
        fn from(version: TestVersion) -> ProtocolVersion {
            ProtocolVersion(version.0)
        }
    }

    /// The genesis hash shared by all tests; its hex form starts with `7ac8741d`.
    fn genesis_hash() -> Hash {
        Hash::from([
            122, 200, 116, 29, 232, 183, 20, 109, 138, 86, 23, 253, 70, 41, 20, 85, 127, 230, 60,
            38, 90, 127, 28, 16, 231, 218, 227, 40, 88, 238, 187, 128,
        ])
    }

    #[test]
    fn protocol_names_are_correctly_generated() {
        let genesis_hash = genesis_hash();
        // (fork id, peer set, raw version, expected on-the-wire name)
        let cases = [
            (
                None,
                PeerSet::Validation,
                3,
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/validation/3",
            ),
            (
                None,
                PeerSet::Collation,
                5,
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/collation/5",
            ),
            (
                Some("test-fork"),
                PeerSet::Validation,
                7,
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/test-fork/validation/7",
            ),
            (
                Some("test-fork"),
                PeerSet::Collation,
                11,
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/test-fork/collation/11",
            ),
        ];

        for (fork_id, peer_set, raw_version, expected) in cases {
            let name = PeerSetProtocolNames::generate_name(
                &genesis_hash,
                fork_id,
                peer_set,
                TestVersion(raw_version).into(),
            );
            assert_eq!(name, expected.into());
        }
    }

    #[test]
    fn all_protocol_names_are_known() {
        let protocol_names = PeerSetProtocolNames::new(genesis_hash(), None);
        // (on-the-wire name, expected peer set and raw version, if the name is known)
        let cases = [
            (
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/validation/3",
                Some((PeerSet::Validation, 3)),
            ),
            // There is no legacy name for the validation protocol.
            ("/pezkuwi/validation/1", None),
            (
                "/7ac8741de8b7146d8a5617fd462914557fe63c265a7f1c10e7dae32858eebb80/collation/1",
                Some((PeerSet::Collation, 1)),
            ),
            ("/pezkuwi/collation/1", Some((PeerSet::Collation, 1))),
        ];

        for (wire_name, expected) in cases {
            let expected: Option<(PeerSet, ProtocolVersion)> =
                expected.map(|(peer_set, raw)| (peer_set, TestVersion(raw).into()));
            assert_eq!(protocol_names.try_get_protocol(&wire_name.into()), expected);
        }
    }

    #[test]
    fn all_protocol_versions_are_registered() {
        let genesis_hash = genesis_hash();
        let protocol_names = PeerSetProtocolNames::new(genesis_hash, None);
        let assert_registered = |peer_set: PeerSet, version: ProtocolVersion| {
            assert_eq!(
                protocol_names.get_name(peer_set, version),
                PeerSetProtocolNames::generate_name(&genesis_hash, None, peer_set, version),
            );
        };

        for peer_set in PeerSet::iter() {
            match peer_set {
                PeerSet::Validation => ValidationVersion::iter()
                    .for_each(|version| assert_registered(peer_set, version.into())),
                PeerSet::Collation => CollationVersion::iter()
                    .for_each(|version| assert_registered(peer_set, version.into())),
            }
        }
    }

    #[test]
    fn all_protocol_versions_have_labels() {
        for peer_set in PeerSet::iter() {
            match peer_set {
                PeerSet::Validation => ValidationVersion::iter().for_each(|version| {
                    peer_set
                        .get_protocol_label(version.into())
                        .expect("All validation protocol versions must have a label.");
                }),
                PeerSet::Collation => CollationVersion::iter().for_each(|version| {
                    peer_set
                        .get_protocol_label(version.into())
                        .expect("All collation protocol versions must have a label.");
                }),
            }
        }
    }
}
@@ -0,0 +1,90 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
pub use sc_network::ReputationChange;
/// Unified annoyance cost and good behavior benefits.
///
/// Every variant carries a static, human readable description of what caused the change.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(missing_docs)]
pub enum UnifiedReputationChange {
    CostMajor(&'static str),
    CostMinor(&'static str),
    CostMajorRepeated(&'static str),
    CostMinorRepeated(&'static str),
    Malicious(&'static str),
    BenefitMinorFirst(&'static str),
    BenefitMinor(&'static str),
    BenefitMajorFirst(&'static str),
    BenefitMajor(&'static str),
}

impl UnifiedReputationChange {
    /// Obtain the cost or benefit associated with
    /// the enum variant.
    ///
    /// Costs are negative, benefits are positive.
    ///
    /// Order of magnitude rationale:
    ///
    /// * the peerset will not connect to a peer whose reputation is below a fixed value
    /// * `max(2% *$rep, 1)` is the delta of convergence towards a reputation of 0
    ///
    /// The whole range of an `i32` should be used, so order of magnitude of
    /// something malicious should be `1<<20` (give or take).
    pub const fn cost_or_benefit(&self) -> i32 {
        match *self {
            // Costs, mildest first.
            Self::CostMinor(_) => -100_000,
            Self::CostMinorRepeated(_) => -200_000,
            Self::CostMajor(_) => -300_000,
            Self::CostMajorRepeated(_) => -600_000,
            Self::Malicious(_) => i32::MIN,
            // Benefits, smallest first.
            Self::BenefitMinor(_) => 10_000,
            Self::BenefitMinorFirst(_) => 15_000,
            Self::BenefitMajor(_) => 200_000,
            Self::BenefitMajorFirst(_) => 300_000,
        }
    }

    /// Extract the static description.
    pub const fn description(&self) -> &'static str {
        match *self {
            Self::CostMinor(text) |
            Self::CostMajor(text) |
            Self::CostMinorRepeated(text) |
            Self::CostMajorRepeated(text) |
            Self::Malicious(text) |
            Self::BenefitMajorFirst(text) |
            Self::BenefitMajor(text) |
            Self::BenefitMinorFirst(text) |
            Self::BenefitMinor(text) => text,
        }
    }

    /// Whether the reputation change is for good behavior.
    pub const fn is_benefit(&self) -> bool {
        matches!(
            self,
            Self::BenefitMajorFirst(_) |
                Self::BenefitMajor(_) |
                Self::BenefitMinorFirst(_) |
                Self::BenefitMinor(_)
        )
    }
}
impl From<UnifiedReputationChange> for ReputationChange {
fn from(value: UnifiedReputationChange) -> Self {
ReputationChange::new(value.cost_or_benefit(), value.description())
}
}
@@ -0,0 +1,41 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Error handling related code and Error/Result definitions.
use sc_network_types::PeerId;
use codec::Error as DecodingError;
/// Errors that can occur while receiving and decoding incoming requests.
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// Incoming request stream exhausted. Should only happen on shutdown.
///
/// This is the only variant marked as fatal.
#[fatal]
#[error("Incoming request channel got closed.")]
RequestChannelExhausted,
/// Decoding failed, we were able to change the peer's reputation accordingly.
#[error("Decoding request failed for peer {0}.")]
DecodingError(PeerId, #[source] DecodingError),
/// Decoding failed, but sending reputation change failed.
#[error("Decoding request failed for peer {0}, and changing reputation failed.")]
DecodingErrorNoReputationChange(PeerId, #[source] DecodingError),
}
/// Shorthand for `std::result::Result` with this module's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
@@ -0,0 +1,232 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use std::marker::PhantomData;
use futures::{channel::oneshot, StreamExt};
use codec::{Decode, Encode};
use sc_network::{config as netconfig, NetworkBackend};
use sc_network_types::PeerId;
use sp_runtime::traits::Block;
use super::{IsRequest, ReqProtocolNames};
use crate::UnifiedReputationChange;
mod error;
pub use error::{Error, FatalError, JfyiError, Result};
/// A request coming in, including a sender for sending responses.
///
/// This is the typed counterpart of the raw substrate `IncomingRequest`: the payload has already
/// been decoded into `Req`. See `IncomingRequest::get_config_receiver` and substrate
/// `NetworkConfiguration` for more information.
#[derive(Debug)]
pub struct IncomingRequest<Req> {
/// `PeerId` of sending peer.
pub peer: PeerId,
/// The sent request, already decoded.
pub payload: Req,
/// Sender for sending the response back to `peer`.
pub pending_response: OutgoingResponseSender<Req>,
}
impl<Req> IncomingRequest<Req>
where
    Req: IsRequest + Decode + Encode,
    Req::Response: Encode,
{
    /// Create configuration for `NetworkConfiguration::request_response_protocols` and a
    /// corresponding typed receiver.
    ///
    /// Register the returned config with substrate networking and receive incoming requests via
    /// the returned `IncomingRequestReceiver`.
    pub fn get_config_receiver<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
        req_protocol_names: &ReqProtocolNames,
    ) -> (IncomingRequestReceiver<Req>, N::RequestResponseProtocolConfig) {
        let (raw, cfg) = Req::PROTOCOL.get_config::<B, N>(req_protocol_names);
        (IncomingRequestReceiver { raw, phantom: PhantomData }, cfg)
    }

    /// Create new `IncomingRequest`.
    ///
    /// Wraps the raw `pending_response` channel in a typed [`OutgoingResponseSender`].
    pub fn new(
        peer: PeerId,
        payload: Req,
        pending_response: oneshot::Sender<netconfig::OutgoingResponse>,
    ) -> Self {
        Self {
            peer,
            payload,
            pending_response: OutgoingResponseSender { pending_response, phantom: PhantomData },
        }
    }

    /// Try building from raw substrate request.
    ///
    /// This function will fail if the request cannot be decoded and will apply passed in
    /// reputation changes in that case.
    ///
    /// Params:
    /// 		- The raw request to decode
    /// 		- Reputation changes to apply for the peer in case decoding fails.
    ///
    /// # Errors
    ///
    /// On a decoding failure an `Err(())` response carrying `reputation_changes` is sent through
    /// the request's response channel. If that send succeeds, `JfyiError::DecodingError` is
    /// returned; if it fails, `JfyiError::DecodingErrorNoReputationChange` is returned.
    fn try_from_raw(
        raw: sc_network::config::IncomingRequest,
        reputation_changes: Vec<UnifiedReputationChange>,
    ) -> std::result::Result<Self, JfyiError> {
        let sc_network::config::IncomingRequest { payload, peer, pending_response } = raw;
        let payload = match Req::decode(&mut payload.as_ref()) {
            Ok(payload) => payload,
            Err(err) => {
                // No payload to answer with: the response only transports the reputation
                // changes for the misbehaving peer.
                let response = sc_network::config::OutgoingResponse {
                    result: Err(()),
                    reputation_changes: reputation_changes.into_iter().map(Into::into).collect(),
                    sent_feedback: None,
                };
                // The variant records whether the reputation change could be handed over.
                return Err(if pending_response.send(response).is_err() {
                    JfyiError::DecodingErrorNoReputationChange(peer, err)
                } else {
                    JfyiError::DecodingError(peer, err)
                });
            },
        };
        Ok(Self::new(peer, payload, pending_response))
    }

    /// Convert into raw untyped substrate `IncomingRequest`.
    ///
    /// The payload is re-encoded. This is mostly useful for testing.
    pub fn into_raw(self) -> sc_network::config::IncomingRequest {
        sc_network::config::IncomingRequest {
            peer: self.peer,
            payload: self.payload.encode(),
            pending_response: self.pending_response.pending_response,
        }
    }

    /// Send the response back.
    ///
    /// Calls [`OutgoingResponseSender::send_response`].
    pub fn send_response(self, resp: Req::Response) -> std::result::Result<(), Req::Response> {
        self.pending_response.send_response(resp)
    }

    /// Send response with additional options.
    ///
    /// Calls [`OutgoingResponseSender::send_outgoing_response`].
    pub fn send_outgoing_response(
        self,
        resp: OutgoingResponse<<Req as IsRequest>::Response>,
    ) -> std::result::Result<(), ()> {
        self.pending_response.send_outgoing_response(resp)
    }
}
/// Sender for sending back responses on an `IncomingRequest`.
///
/// Wraps the raw response channel; `Req` is only carried at the type level.
#[derive(Debug)]
pub struct OutgoingResponseSender<Req> {
// Raw channel on which the (encoded) response is sent.
pending_response: oneshot::Sender<netconfig::OutgoingResponse>,
// Ties this sender to the request type without storing a `Req`.
phantom: PhantomData<Req>,
}
impl<Req> OutgoingResponseSender<Req>
where
    Req: IsRequest + Decode,
    Req::Response: Encode,
{
    /// Send the response back.
    ///
    /// Returns `Ok(())` when the response was handed over, otherwise gives the unsent
    /// `Response` back as the error.
    ///
    /// No reputation changes and no sent-feedback channel are attached here; use
    /// [`Self::send_outgoing_response`] when those are needed.
    pub fn send_response(self, resp: Req::Response) -> std::result::Result<(), Req::Response> {
        let outgoing = netconfig::OutgoingResponse {
            result: Ok(resp.encode()),
            reputation_changes: Vec::new(),
            sent_feedback: None,
        };
        match self.pending_response.send(outgoing) {
            Ok(()) => Ok(()),
            // Sending failed: return the original, still unencoded response to the caller.
            Err(_) => Err(resp),
        }
    }

    /// Send response with additional options.
    ///
    /// Compared to [`Self::send_response`] this allows waiting for the response to be sent out
    /// (`sent_feedback`), changing the peer's reputation (`reputation_changes`) and sending no
    /// payload at all (`result: Err(())`), e.g. for only changing the peer's reputation.
    pub fn send_outgoing_response(
        self,
        resp: OutgoingResponse<<Req as IsRequest>::Response>,
    ) -> std::result::Result<(), ()> {
        let encoded = resp.result.map(|payload| payload.encode());
        let reputation_changes = resp.reputation_changes.into_iter().map(Into::into).collect();
        let raw_response = netconfig::OutgoingResponse {
            result: encoded,
            reputation_changes,
            sent_feedback: resp.sent_feedback,
        };
        self.pending_response.send(raw_response).map_err(|_| ())
    }
}
/// Typed variant of [`netconfig::OutgoingResponse`].
///
/// Responses to `IncomingRequest`s. The payload is kept as `Response` here and only encoded when
/// it is sent via `OutgoingResponseSender::send_outgoing_response`.
pub struct OutgoingResponse<Response> {
/// The payload of the response.
///
/// `Err(())` if none is available e.g. due to an error while handling the request.
pub result: std::result::Result<Response, ()>,
/// Reputation changes accrued while handling the request. To be applied to the reputation of
/// the peer sending the request.
pub reputation_changes: Vec<UnifiedReputationChange>,
/// If provided, the `oneshot::Sender` will be notified when the request has been sent to the
/// peer.
pub sent_feedback: Option<oneshot::Sender<()>>,
}
/// Receiver for incoming requests.
///
/// Takes care of decoding and handling of invalid encoded requests.
pub struct IncomingRequestReceiver<Req> {
// Channel delivering the raw, still encoded requests.
raw: async_channel::Receiver<netconfig::IncomingRequest>,
// Fixes the request type that raw requests get decoded into.
phantom: PhantomData<Req>,
}
impl<Req> IncomingRequestReceiver<Req>
where
    Req: IsRequest + Decode + Encode,
    Req::Response: Encode,
{
    /// Try to receive the next incoming request.
    ///
    /// Waits for the next raw request and decodes it. `reputation_changes` is only invoked once
    /// a raw request has arrived; the changes it yields are applied if decoding fails, in which
    /// case an error is reported.
    ///
    /// # Errors
    ///
    /// Returns `FatalError::RequestChannelExhausted` when the underlying channel yields no more
    /// requests, and the decoding error of `IncomingRequest::try_from_raw` otherwise.
    pub async fn recv<F>(&mut self, reputation_changes: F) -> Result<IncomingRequest<Req>>
    where
        F: FnOnce() -> Vec<UnifiedReputationChange>,
    {
        match self.raw.next().await {
            Some(raw) => IncomingRequest::<Req>::try_from_raw(raw, reputation_changes())
                .map_err(Into::into),
            None => Err(FatalError::RequestChannelExhausted.into()),
        }
    }
}
@@ -0,0 +1,377 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Overview over request/responses as used in `Pezkuwi`.
//!
//! `enum Protocol` .... List of all supported protocols.
//!
//! `enum Requests` .... List of all supported requests, each entry matches one in protocols, but
//! has the actual request as payload.
//!
//! `struct IncomingRequest` .... wrapper for incoming requests, containing a sender for sending
//! responses.
//!
//! `struct OutgoingRequest` .... wrapper for outgoing requests, containing a sender used by the
//! networking code for delivering responses/delivery errors.
//!
//! `trait IsRequest` .... A trait describing a particular request. It is used for gathering meta
//! data, like what is the corresponding response type.
//!
//! ## Versioning
//!
//! Versioning for request-response protocols can be done in multiple ways.
//!
//! If you're just changing the protocol name but the binary payloads are the same, just add a new
//! `fallback_name` to the protocol config.
//!
//! One way in which versioning has historically been achieved for req-response protocols is to
//! bundle the new req-resp version with an upgrade of a notifications protocol. The subsystem would
//! then know which request version to use based on stored data about the peer's notifications
//! protocol version.
//!
//! When bumping a notifications protocol version is not needed/desirable, you may add a new
//! req-resp protocol and set the old request as a fallback (see
//! `OutgoingRequest::new_with_fallback`). A request with the new version will be attempted and if
//! the protocol is refused by the peer, the fallback protocol request will be used.
//! Information about the actually used protocol will be returned alongside the raw response, so
//! that you know how to decode it.
use std::{collections::HashMap, time::Duration, u64};
use pezkuwi_primitives::MAX_CODE_SIZE;
use sc_network::{NetworkBackend, MAX_RESPONSE_SIZE};
use sp_runtime::traits::Block;
use strum::{EnumIter, IntoEnumIterator};
pub use sc_network::{config as network, config::RequestResponseConfig, ProtocolName};
/// Everything related to handling of incoming requests.
pub mod incoming;
/// Everything related to handling of outgoing requests.
pub mod outgoing;
pub use incoming::{IncomingRequest, IncomingRequestReceiver};
pub use outgoing::{OutgoingRequest, OutgoingResult, Recipient, Requests, ResponseSender};
///// Multiplexer for incoming requests.
// pub mod multiplexer;
/// Actual versioned requests and responses that are sent over the wire.
pub mod v1;
/// Actual versioned requests and responses that are sent over the wire.
pub mod v2;
/// A protocol per subsystem seems to make the most sense, this way we don't need any dispatching
/// within protocols.
///
/// `EnumIter` is derived so that `ReqProtocolNames::new` can enumerate every variant when
/// generating the on-the-wire names.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, EnumIter)]
pub enum Protocol {
/// Protocol for chunk fetching, used by availability distribution and availability recovery.
ChunkFetchingV1,
/// Protocol for fetching collations from collators.
CollationFetchingV1,
/// Protocol for fetching collations from collators when async backing is enabled.
CollationFetchingV2,
/// Protocol for fetching seconded PoVs from validators of the same group.
PoVFetchingV1,
/// Protocol for fetching available data.
AvailableDataFetchingV1,
/// Sending of dispute statements with application level confirmations.
DisputeSendingV1,
/// Protocol for requesting candidates with attestations in statement distribution
/// when async backing is enabled.
AttestedCandidateV2,
/// Protocol for chunk fetching version 2, used by availability distribution and availability
/// recovery.
ChunkFetchingV2,
}
/// Minimum bandwidth we expect for validators - 500Mbit/s is the recommendation, so approximately
/// 50MB per second:
const MIN_BANDWIDTH_BYTES: u64 = 50 * 1024 * 1024;
/// Default request timeout in seconds.
///
/// When decreasing this value, take into account that the very first request might need to open a
/// connection, which can be slow. If this causes problems, we should ensure connectivity via peer
/// sets.
#[allow(dead_code)]
const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);
/// Request timeout where we can assume the connection is already open (e.g. we have peers in a
/// peer set as well).
const DEFAULT_REQUEST_TIMEOUT_CONNECTED: Duration = Duration::from_secs(1);
/// Timeout for requesting availability chunks.
pub const CHUNK_REQUEST_TIMEOUT: Duration = DEFAULT_REQUEST_TIMEOUT_CONNECTED;
/// This timeout is based on the following parameters, assuming we use asynchronous backing with no
/// time budget within a relay block:
/// - 500 Mbit/s networking speed
/// - 10 MB PoV
/// - 10 parallel executions
const POV_REQUEST_TIMEOUT_CONNECTED: Duration = Duration::from_millis(2000);
/// We want attested candidate requests to time out relatively fast,
/// because slow requests will bottleneck the backing system. Ideally, we'd have
/// an adaptive timeout based on the candidate size, because there will be a lot of variance
/// in candidate sizes: candidates with no code and no messages vs candidates with code
/// and messages.
///
/// We supply leniency because there are often large candidates and asynchronous
/// backing allows them to be included over a longer window of time. Exponential back-off
/// up to a maximum of 10 seconds would be ideal, but isn't supported by the
/// infrastructure here yet: see <https://github.com/paritytech/polkadot/issues/6009>
const ATTESTED_CANDIDATE_TIMEOUT: Duration = Duration::from_millis(2500);
/// We don't want a slow peer to slow down all the others, at the same time we want to get out the
/// data quickly in full to at least some peers (as this will reduce load on us as they then can
/// start serving the data). So this value is a tradeoff. 5 seems to be sensible. So we would need
/// to have 5 slow nodes connected, to delay transfer for others by `ATTESTED_CANDIDATE_TIMEOUT`.
pub const MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS: u32 = 5;
/// Response size limit for responses of POV like data.
///
/// Same as what we use in substrate networking.
const POV_RESPONSE_SIZE: u64 = MAX_RESPONSE_SIZE;
/// Maximum response sizes for `AttestedCandidateV2`.
///
/// This is `MAX_CODE_SIZE` plus some additional space for protocol overhead and
/// additional backing statements.
const ATTESTED_CANDIDATE_RESPONSE_SIZE: u64 = MAX_CODE_SIZE as u64 + 100_000;
/// We can have relatively large timeouts here, there is no value in hitting a
/// timeout as we want to get statements through to each node in any case.
pub const DISPUTE_REQUEST_TIMEOUT: Duration = Duration::from_secs(12);
impl Protocol {
    /// Get a configuration for a given Request response protocol.
    ///
    /// Returns a `ProtocolConfig` for this protocol without an inbound queue.
    /// Use this if you plan only to send requests for this protocol.
    pub fn get_outbound_only_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
        self,
        req_protocol_names: &ReqProtocolNames,
    ) -> N::RequestResponseProtocolConfig {
        self.create_config::<B, N>(req_protocol_names, None)
    }

    /// Get a configuration for a given Request response protocol.
    ///
    /// Returns a receiver for messages received on this protocol and the requested
    /// `ProtocolConfig`. The channel between the two is bounded by `get_channel_size`.
    pub fn get_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
        self,
        req_protocol_names: &ReqProtocolNames,
    ) -> (async_channel::Receiver<network::IncomingRequest>, N::RequestResponseProtocolConfig) {
        let (sender, receiver) = async_channel::bounded(self.get_channel_size());
        let config = self.create_config::<B, N>(req_protocol_names, Some(sender));
        (receiver, config)
    }

    /// Builds the backend specific config for this protocol.
    ///
    /// All protocols share the same maximum request size (`1_000`); they only differ in the
    /// maximum response size and the request timeout, which are picked per protocol below.
    /// `tx` is the optional inbound queue for incoming requests.
    fn create_config<B: Block, N: NetworkBackend<B, <B as Block>::Hash>>(
        self,
        req_protocol_names: &ReqProtocolNames,
        tx: Option<async_channel::Sender<network::IncomingRequest>>,
    ) -> N::RequestResponseProtocolConfig {
        let name = req_protocol_names.get_name(self);
        let legacy_names = self.get_legacy_name().into_iter().map(Into::into).collect();
        let (max_response_size, request_timeout) = match self {
            Protocol::ChunkFetchingV1 | Protocol::ChunkFetchingV2 => (
                POV_RESPONSE_SIZE,
                // We are connected to all validators:
                CHUNK_REQUEST_TIMEOUT,
            ),
            Protocol::CollationFetchingV1 | Protocol::CollationFetchingV2 => (
                POV_RESPONSE_SIZE,
                // Taken from initial implementation in collator protocol:
                POV_REQUEST_TIMEOUT_CONNECTED,
            ),
            Protocol::PoVFetchingV1 => (POV_RESPONSE_SIZE, POV_REQUEST_TIMEOUT_CONNECTED),
            Protocol::AvailableDataFetchingV1 => (
                // Available data size is dominated by the PoV size.
                POV_RESPONSE_SIZE,
                POV_REQUEST_TIMEOUT_CONNECTED,
            ),
            Protocol::DisputeSendingV1 => (
                // Responses are just confirmation, in essence not even a bit. So 100 seems
                // plenty.
                100,
                DISPUTE_REQUEST_TIMEOUT,
            ),
            Protocol::AttestedCandidateV2 =>
                (ATTESTED_CANDIDATE_RESPONSE_SIZE, ATTESTED_CANDIDATE_TIMEOUT),
        };
        N::request_response_config(
            name,
            legacy_names,
            1_000,
            max_response_size,
            request_timeout,
            tx,
        )
    }

    // Channel sizes for the supported protocols.
    fn get_channel_size(self) -> usize {
        match self {
            // Hundreds of validators will start requesting their chunks once they see a candidate
            // awaiting availability on chain. Given that they will see that block at different
            // times (due to network delays), 100 seems big enough to accommodate for "bursts",
            // assuming we can service requests relatively quickly, which would need to be measured
            // as well.
            Protocol::ChunkFetchingV1 | Protocol::ChunkFetchingV2 => 100,
            // 10 seems reasonable, considering group sizes of max 10 validators.
            Protocol::CollationFetchingV1 | Protocol::CollationFetchingV2 => 10,
            // 10 seems reasonable, considering group sizes of max 10 validators.
            Protocol::PoVFetchingV1 => 10,
            // Validators are constantly self-selecting to request available data which may lead
            // to constant load and occasional burstiness.
            Protocol::AvailableDataFetchingV1 => 100,
            // Incoming requests can get bursty, we should also be able to handle them fast on
            // average, so something in the ballpark of 100 should be fine. Nodes will retry on
            // failure, so having a good value here is mostly about performance tuning.
            Protocol::DisputeSendingV1 => 100,
            Protocol::AttestedCandidateV2 => {
                // We assume we can utilize up to 70% of the available bandwidth for statements.
                // This is just a guess/estimate, with the following considerations: If we are
                // faster than that, queue size will stay low anyway, even if not - requesters will
                // get an immediate error, but if we are slower, requesters will run in a timeout -
                // wasting precious time.
                let available_bandwidth = 7 * MIN_BANDWIDTH_BYTES / 10;
                let timeout_millis = ATTESTED_CANDIDATE_TIMEOUT.as_millis() as u64;
                // `1000 *` accounts for the timeout being in milliseconds while the bandwidth is
                // in bytes per second.
                let divisor = 1000 * MAX_CODE_SIZE as u64;
                let servable = timeout_millis * available_bandwidth / divisor;
                let size =
                    servable.saturating_sub(MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as u64);
                debug_assert!(
                    size > 0,
                    "We should have a channel size greater zero, otherwise we won't accept any requests."
                );
                size as usize
            },
        }
    }

    /// Legacy protocol name associated with each peer set, if any.
    /// The request will be tried on this legacy protocol name if the remote refuses to speak the
    /// protocol.
    const fn get_legacy_name(self) -> Option<&'static str> {
        match self {
            Protocol::ChunkFetchingV1 => Some("/pezkuwi/req_chunk/1"),
            Protocol::CollationFetchingV1 => Some("/pezkuwi/req_collation/1"),
            Protocol::PoVFetchingV1 => Some("/pezkuwi/req_pov/1"),
            Protocol::AvailableDataFetchingV1 => Some("/pezkuwi/req_available_data/1"),
            Protocol::DisputeSendingV1 => Some("/pezkuwi/send_dispute/1"),
            // Introduced after legacy names became legacy.
            Protocol::AttestedCandidateV2 |
            Protocol::CollationFetchingV2 |
            Protocol::ChunkFetchingV2 => None,
        }
    }
}
/// Common properties of any `Request`.
///
/// Ties a request type to its response type and to the [`Protocol`] it is sent on.
pub trait IsRequest {
/// Each request has a corresponding `Response`.
type Response;
/// What protocol this `Request` implements.
const PROTOCOL: Protocol;
}
/// Type for getting on the wire [`Protocol`] names using genesis hash & fork id.
#[derive(Clone)]
pub struct ReqProtocolNames {
// Pre-computed name for every `Protocol` variant, filled in by `ReqProtocolNames::new`.
names: HashMap<Protocol, ProtocolName>,
}
impl ReqProtocolNames {
/// Construct [`ReqProtocolNames`] from `genesis_hash` and `fork_id`.
pub fn new<Hash: AsRef<[u8]>>(genesis_hash: Hash, fork_id: Option<&str>) -> Self {
let mut names = HashMap::new();
for protocol in Protocol::iter() {
names.insert(protocol, Self::generate_name(protocol, &genesis_hash, fork_id));
}
Self { names }
}
/// Get on the wire [`Protocol`] name.
pub fn get_name(&self, protocol: Protocol) -> ProtocolName {
self.names
.get(&protocol)
.expect("All `Protocol` enum variants are added above via `strum`; qed")
.clone()
}
/// Protocol name of this protocol based on `genesis_hash` and `fork_id`.
fn generate_name<Hash: AsRef<[u8]>>(
protocol: Protocol,
genesis_hash: &Hash,
fork_id: Option<&str>,
) -> ProtocolName {
let prefix = if let Some(fork_id) = fork_id {
format!("/{}/{}", hex::encode(genesis_hash), fork_id)
} else {
format!("/{}", hex::encode(genesis_hash))
};
let short_name = match protocol {
// V1:
Protocol::ChunkFetchingV1 => "/req_chunk/1",
Protocol::CollationFetchingV1 => "/req_collation/1",
Protocol::PoVFetchingV1 => "/req_pov/1",
Protocol::AvailableDataFetchingV1 => "/req_available_data/1",
Protocol::DisputeSendingV1 => "/send_dispute/1",
// V2:
Protocol::CollationFetchingV2 => "/req_collation/2",
Protocol::AttestedCandidateV2 => "/req_attested_candidate/2",
Protocol::ChunkFetchingV2 => "/req_chunk/2",
};
format!("{}{}", prefix, short_name).into()
}
}
@@ -0,0 +1,205 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
use futures::{channel::oneshot, prelude::Future, FutureExt};
use codec::{Decode, Encode, Error as DecodingError};
use network::ProtocolName;
use sc_network as network;
use sc_network_types::PeerId;
use pezkuwi_primitives::AuthorityDiscoveryId;
use super::{v1, v2, IsRequest, Protocol};
/// All requests that can be sent to the network bridge via `NetworkBridgeTxMessage::SendRequest`.
#[derive(Debug)]
pub enum Requests {
/// Request an availability chunk from a node.
///
/// Carries a v2 request with a v1 request type as fallback.
ChunkFetching(OutgoingRequest<v2::ChunkFetchingRequest, v1::ChunkFetchingRequest>),
/// Fetch a collation from a collator which previously announced it.
CollationFetchingV1(OutgoingRequest<v1::CollationFetchingRequest>),
/// Fetch a PoV from a validator which previously sent out a seconded statement.
PoVFetchingV1(OutgoingRequest<v1::PoVFetchingRequest>),
/// Request full available data from a node.
AvailableDataFetchingV1(OutgoingRequest<v1::AvailableDataFetchingRequest>),
/// Requests for notifying about an ongoing dispute.
DisputeSendingV1(OutgoingRequest<v1::DisputeRequest>),
/// Request a candidate and attestations.
AttestedCandidateV2(OutgoingRequest<v2::AttestedCandidateRequest>),
/// Fetch a collation from a collator which previously announced it.
/// Compared to V1 it requires specifying which candidate is requested by its hash.
CollationFetchingV2(OutgoingRequest<v2::CollationFetchingRequest>),
}
impl Requests {
    /// Encode the request.
    ///
    /// The corresponding protocol is returned as well, as we are now leaving typed territory.
    ///
    /// Note: `Requests` is just an enum collecting all requests supported by the network
    /// bridge, it is never sent over the wire. This function just encodes the individual
    /// request contained in the `enum`.
    pub fn encode_request(self) -> (Protocol, OutgoingRequest<Vec<u8>>) {
        // Every variant wraps a differently typed `OutgoingRequest`, so each arm has to call
        // that type's own `encode_request`.
        match self {
            Self::ChunkFetching(request) => request.encode_request(),
            Self::CollationFetchingV1(request) => request.encode_request(),
            Self::PoVFetchingV1(request) => request.encode_request(),
            Self::AvailableDataFetchingV1(request) => request.encode_request(),
            Self::DisputeSendingV1(request) => request.encode_request(),
            Self::AttestedCandidateV2(request) => request.encode_request(),
            Self::CollationFetchingV2(request) => request.encode_request(),
        }
    }
}
/// Used by the network to send us a response to a request.
///
/// A successful response consists of the raw response bytes together with the `ProtocolName`.
pub type ResponseSender = oneshot::Sender<Result<(Vec<u8>, ProtocolName), network::RequestFailure>>;
/// Any error that can occur when sending a request.
///
/// Every variant has a `#[from]` conversion from its payload type, so `?` can be used on the
/// underlying errors directly.
#[derive(Debug, thiserror::Error)]
pub enum RequestError {
/// Response could not be decoded.
#[error("Response could not be decoded: {0}")]
InvalidResponse(#[from] DecodingError),
/// Some error in substrate/libp2p happened.
#[error("{0}")]
NetworkError(#[from] network::RequestFailure),
/// Response got canceled by networking.
#[error("Response channel got canceled")]
Canceled(#[from] oneshot::Canceled),
}
impl RequestError {
    /// Whether the error represents some kind of timeout condition.
    ///
    /// A canceled response channel, an obsolete request and a network level outbound timeout
    /// all count as timeouts; every other error does not.
    pub fn is_timed_out(&self) -> bool {
        matches!(
            self,
            Self::Canceled(_) |
                Self::NetworkError(network::RequestFailure::Obsolete) |
                Self::NetworkError(network::RequestFailure::Network(
                    network::OutboundFailure::Timeout,
                ))
        )
    }
}
/// A request to be sent to the network bridge, including a sender for sending responses/failures.
///
/// The network implementation will make use of that sender for informing the requesting subsystem
/// about responses/errors.
///
/// When using `Recipient::Peer`, keep in mind that no address (as in IP address and port) might
/// be known for that specific peer. You are encouraged to use `Peer` for peers that you are
/// expected to be already connected to.
/// When using `Recipient::Authority`, the addresses can be found thanks to the authority
/// discovery system.
///
/// `FallbackReq` is the type of the optional fallback request and defaults to `Req`.
#[derive(Debug)]
pub struct OutgoingRequest<Req, FallbackReq = Req> {
/// Intended recipient of this request.
pub peer: Recipient,
/// The actual request to send over the wire.
pub payload: Req,
/// Optional fallback request and protocol.
///
/// `None` when built with `OutgoingRequest::new`, `Some` when built with
/// `OutgoingRequest::new_with_fallback`.
pub fallback_request: Option<(FallbackReq, Protocol)>,
/// Sender which is used by networking to get us back a response.
pub pending_response: ResponseSender,
}
/// Potential recipients of an outgoing request.
///
/// A recipient is addressed either directly by peer id or by its authority discovery id.
#[derive(Debug, Eq, Hash, PartialEq, Clone)]
pub enum Recipient {
/// Recipient is a regular peer and we know its peer id.
Peer(PeerId),
/// Recipient is a validator, we address it via this `AuthorityDiscoveryId`.
Authority(AuthorityDiscoveryId),
}
/// Responses received for an `OutgoingRequest`: either the response `Res` or a [`RequestError`].
pub type OutgoingResult<Res> = Result<Res, RequestError>;
impl<Req, FallbackReq> OutgoingRequest<Req, FallbackReq>
where
    Req: IsRequest + Encode,
    Req::Response: Decode,
    FallbackReq: IsRequest + Encode,
    FallbackReq::Response: Decode,
{
    /// Create a new `OutgoingRequest` without a fallback.
    ///
    /// It will contain a sender that is used by the networking for sending back responses. The
    /// second element of the returned tuple is a future resolving to the decoded response.
    pub fn new(
        peer: Recipient,
        payload: Req,
    ) -> (Self, impl Future<Output = OutgoingResult<Req::Response>>) {
        let (tx, rx) = oneshot::channel();
        let request = Self { peer, payload, fallback_request: None, pending_response: tx };
        // Without a fallback there is only one possible protocol, so the protocol name that
        // accompanies the raw bytes is dropped before decoding.
        let raw_response = rx.map(|received| {
            received.map(|outcome| outcome.map(|(bytes, _protocol)| bytes))
        });
        (request, receive_response::<Req>(raw_response))
    }

    /// Create a new `OutgoingRequest` with a fallback in case the remote does not support this
    /// protocol. Useful when adding a new version of a req-response protocol, to achieve
    /// compatibility with the older version.
    ///
    /// Returns a raw `Vec<u8>` response over the channel. Use the associated `ProtocolName` to know
    /// which request was the successful one and appropriately decode the response.
    pub fn new_with_fallback(
        peer: Recipient,
        payload: Req,
        fallback_request: FallbackReq,
    ) -> (Self, impl Future<Output = OutgoingResult<(Vec<u8>, ProtocolName)>>) {
        let (tx, rx) = oneshot::channel();
        let fallback = (fallback_request, FallbackReq::PROTOCOL);
        let request =
            Self { peer, payload, fallback_request: Some(fallback), pending_response: tx };
        (
            request,
            async move {
                // First `?`: channel canceled, second `?`: request failure.
                let reply = rx.await??;
                Ok(reply)
            },
        )
    }

    /// Encode a request into a `Vec<u8>`.
    ///
    /// As this throws away type information, we also return the `Protocol` this encoded request
    /// adheres to. A fallback request, if present, is encoded as well and keeps its protocol.
    pub fn encode_request(self) -> (Protocol, OutgoingRequest<Vec<u8>>) {
        let payload = self.payload.encode();
        let fallback_request = self
            .fallback_request
            .map(|(request, protocol)| (request.encode(), protocol));
        let encoded = OutgoingRequest {
            peer: self.peer,
            payload,
            fallback_request,
            pending_response: self.pending_response,
        };
        (Req::PROTOCOL, encoded)
    }
}
/// Future for actually receiving a typed response for an `OutgoingRequest`.
///
/// Awaits `rec`, propagates a canceled channel or a request failure as `RequestError`, and
/// decodes the received bytes into `Req::Response`.
async fn receive_response<Req>(
    rec: impl Future<Output = Result<Result<Vec<u8>, network::RequestFailure>, oneshot::Canceled>>,
) -> OutgoingResult<Req::Response>
where
    Req: IsRequest,
    Req::Response: Decode,
{
    let bytes = rec.await??;
    let response = <Req::Response as Decode>::decode(&mut bytes.as_slice())?;
    Ok(response)
}
@@ -0,0 +1,214 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Pezkuwi.
// Pezkuwi is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Pezkuwi is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
//! Requests and responses as sent over the wire for the individual protocols.
use codec::{Decode, Encode};
use pezkuwi_node_primitives::{
AvailableData, DisputeMessage, ErasureChunk, PoV, Proof, UncheckedDisputeMessage,
};
use pezkuwi_primitives::{
CandidateHash, CandidateReceiptV2 as CandidateReceipt, Hash, HeadData, Id as ParaId,
ValidatorIndex,
};
use super::{IsRequest, Protocol};
/// Request an availability chunk.
///
/// Sent on `Protocol::ChunkFetchingV1`.
#[derive(Debug, Copy, Clone, Encode, Decode)]
pub struct ChunkFetchingRequest {
/// Hash of candidate we want a chunk for.
pub candidate_hash: CandidateHash,
/// The validator index we are requesting from. This must be identical to the index of the
/// chunk we'll receive. For v2, this may not be the case.
pub index: ValidatorIndex,
}
/// Receive a requested erasure chunk.
///
/// Converts to and from `Option<ChunkResponse>`, with `NoSuchChunk` corresponding to `None`.
#[derive(Debug, Clone, Encode, Decode)]
pub enum ChunkFetchingResponse {
/// The requested chunk data.
#[codec(index = 0)]
Chunk(ChunkResponse),
/// Node was not in possession of the requested chunk.
#[codec(index = 1)]
NoSuchChunk,
}
impl From<Option<ChunkResponse>> for ChunkFetchingResponse {
fn from(x: Option<ChunkResponse>) -> Self {
match x {
Some(c) => ChunkFetchingResponse::Chunk(c),
None => ChunkFetchingResponse::NoSuchChunk,
}
}
}
impl From<ChunkFetchingResponse> for Option<ChunkResponse> {
    /// Maps `Chunk(chunk)` to `Some(chunk)` and `NoSuchChunk` to `None`.
    fn from(x: ChunkFetchingResponse) -> Self {
        if let ChunkFetchingResponse::Chunk(chunk) = x {
            Some(chunk)
        } else {
            None
        }
    }
}
/// Slimmed down variant of `ErasureChunk`.
///
/// Instead of transmitting a full `ErasureChunk` we transmit `ChunkResponse` in
/// `ChunkFetchingResponse`, which omits the chunk's index. The index is already known by
/// the requester and by not transmitting it, we ensure the requester is going to use their own
/// index value for validating the response, thus making sure they got what they requested.
#[derive(Debug, Clone, Encode, Decode)]
pub struct ChunkResponse {
/// The erasure-encoded chunk of data belonging to the candidate block.
pub chunk: Vec<u8>,
/// Proof for this chunk's branch in the Merkle tree.
pub proof: Proof,
}
impl From<ErasureChunk> for ChunkResponse {
    /// Strip the index from an `ErasureChunk`, keeping only the chunk bytes and the proof.
    fn from(erasure_chunk: ErasureChunk) -> Self {
        // The index is deliberately discarded: it is not transmitted in the response.
        let ErasureChunk { chunk, index: _, proof } = erasure_chunk;
        Self { chunk, proof }
    }
}
impl ChunkResponse {
    /// Rebuild a full `ErasureChunk` out of this response and the request it answers.
    ///
    /// The chunk bytes and proof come from the response; the index is taken from `req.index`
    /// (converted with `Into`), never from the peer.
    pub fn recombine_into_chunk(self, req: &ChunkFetchingRequest) -> ErasureChunk {
        let Self { chunk, proof } = self;
        ErasureChunk { index: req.index.into(), chunk, proof }
    }
}
/// Binds [`ChunkFetchingRequest`] to its response type and to the protocol it is sent over.
impl IsRequest for ChunkFetchingRequest {
/// Response type associated with this request.
type Response = ChunkFetchingResponse;
/// Protocol associated with this request.
const PROTOCOL: Protocol = Protocol::ChunkFetchingV1;
}
/// Request the advertised collation at that relay-parent.
///
/// Sent over `Protocol::CollationFetchingV1` and answered with a [`CollationFetchingResponse`].
#[derive(Debug, Clone, Encode, Decode)]
pub struct CollationFetchingRequest {
/// Relay parent we want a collation for.
pub relay_parent: Hash,
/// The `ParaId` of the collation.
pub para_id: ParaId,
}
/// Responses as sent by collators in reply to a [`CollationFetchingRequest`].
///
/// Both variants carry the candidate receipt and its PoV; the second one additionally carries
/// the parent head data.
#[derive(Debug, Clone, Encode, Decode)]
pub enum CollationFetchingResponse {
/// Deliver requested collation.
#[codec(index = 0)]
Collation(CandidateReceipt, PoV),
/// Deliver requested collation along with parent head data.
#[codec(index = 1)]
CollationWithParentHeadData {
/// The receipt of the candidate.
receipt: CandidateReceipt,
/// Candidate's proof of validity.
pov: PoV,
/// The head data of the candidate's parent.
/// This is needed for elastic scaling to work.
parent_head_data: HeadData,
},
}
/// Binds [`CollationFetchingRequest`] to its response type and to the protocol it is sent over.
impl IsRequest for CollationFetchingRequest {
/// Response type associated with this request.
type Response = CollationFetchingResponse;
/// Protocol associated with this request.
const PROTOCOL: Protocol = Protocol::CollationFetchingV1;
}
/// Request the PoV of a candidate.
///
/// Sent over `Protocol::PoVFetchingV1` and answered with a [`PoVFetchingResponse`].
#[derive(Debug, Clone, Encode, Decode)]
pub struct PoVFetchingRequest {
/// Candidate we want a PoV for.
pub candidate_hash: CandidateHash,
}
/// Responses to `PoVFetchingRequest`: either the requested PoV or a notice that it was not
/// found.
#[derive(Debug, Clone, Encode, Decode)]
pub enum PoVFetchingResponse {
/// Deliver requested PoV.
#[codec(index = 0)]
PoV(PoV),
/// PoV was not found in store.
#[codec(index = 1)]
NoSuchPoV,
}
/// Binds [`PoVFetchingRequest`] to its response type and to the protocol it is sent over.
impl IsRequest for PoVFetchingRequest {
/// Response type associated with this request.
type Response = PoVFetchingResponse;
/// Protocol associated with this request.
const PROTOCOL: Protocol = Protocol::PoVFetchingV1;
}
/// Request the entire available data for a candidate.
///
/// Sent over `Protocol::AvailableDataFetchingV1` and answered with an
/// [`AvailableDataFetchingResponse`].
#[derive(Debug, Clone, Encode, Decode)]
pub struct AvailableDataFetchingRequest {
/// The candidate hash to get the available data for.
pub candidate_hash: CandidateHash,
}
/// Response to an `AvailableDataFetchingRequest`: either the requested available data or a
/// notice that the peer does not have it.
///
/// Can be built from an `Option<AvailableData>` (`Some` -> `AvailableData`, `None` ->
/// `NoSuchData`).
#[derive(Debug, Clone, Encode, Decode)]
pub enum AvailableDataFetchingResponse {
/// The requested data.
#[codec(index = 0)]
AvailableData(AvailableData),
/// Node was not in possession of the requested data.
#[codec(index = 1)]
NoSuchData,
}
impl From<Option<AvailableData>> for AvailableDataFetchingResponse {
fn from(x: Option<AvailableData>) -> Self {
match x {
Some(data) => AvailableDataFetchingResponse::AvailableData(data),
None => AvailableDataFetchingResponse::NoSuchData,
}
}
}
/// Binds [`AvailableDataFetchingRequest`] to its response type and to the protocol it is sent
/// over.
impl IsRequest for AvailableDataFetchingRequest {
/// Response type associated with this request.
type Response = AvailableDataFetchingResponse;
/// Protocol associated with this request.
const PROTOCOL: Protocol = Protocol::AvailableDataFetchingV1;
}
/// A dispute request.
///
/// Contains an invalid vote and a valid one for a particular candidate in a given session.
///
/// This is a newtype around `UncheckedDisputeMessage`; it can be built from a `DisputeMessage`
/// via `From`.
#[derive(Clone, Encode, Decode, Debug)]
pub struct DisputeRequest(pub UncheckedDisputeMessage);
impl From<DisputeMessage> for DisputeRequest {
fn from(msg: DisputeMessage) -> Self {
Self(msg.into())
}
}
/// Possible responses to a `DisputeRequest`.
///
/// Currently there is a single variant, `Confirmed`.
#[derive(Encode, Decode, Debug, PartialEq, Eq)]
pub enum DisputeResponse {
/// Recipient successfully processed the dispute request.
#[codec(index = 0)]
Confirmed,
}
/// Binds [`DisputeRequest`] to its response type and to the protocol it is sent over.
impl IsRequest for DisputeRequest {
/// Response type associated with this request.
type Response = DisputeResponse;
/// Protocol associated with this request.
const PROTOCOL: Protocol = Protocol::DisputeSendingV1;
}

Some files were not shown because too many files have changed in this diff Show More