feat: initialize Kurdistan SDK - independent fork of Polkadot SDK
This commit is contained in:
@@ -0,0 +1,197 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Main recovery task logic. Runs recovery strategies.
|
||||
|
||||
#![warn(missing_docs)]
|
||||
|
||||
mod strategy;
|
||||
|
||||
pub use self::strategy::{
|
||||
FetchChunks, FetchChunksParams, FetchFull, FetchFullParams, FetchSystematicChunks,
|
||||
FetchSystematicChunksParams, RecoveryStrategy, State,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::strategy::{REGULAR_CHUNKS_REQ_RETRY_LIMIT, SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT};
|
||||
|
||||
use crate::{metrics::Metrics, ErasureTask, PostRecoveryCheck, LOG_TARGET};
|
||||
|
||||
use codec::Encode;
|
||||
use pezkuwi_node_primitives::AvailableData;
|
||||
use pezkuwi_node_subsystem::{messages::AvailabilityStoreMessage, overseer, RecoveryError};
|
||||
use pezkuwi_primitives::{AuthorityDiscoveryId, CandidateHash, Hash};
|
||||
use sc_network::ProtocolName;
|
||||
|
||||
use futures::channel::{mpsc, oneshot};
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// Recovery parameters common to all strategies in a `RecoveryTask`.
|
||||
#[derive(Clone)]
|
||||
pub struct RecoveryParams {
|
||||
/// Discovery ids of `validators`.
|
||||
pub validator_authority_keys: Vec<AuthorityDiscoveryId>,
|
||||
|
||||
/// Number of validators.
|
||||
pub n_validators: usize,
|
||||
|
||||
/// The number of regular chunks needed.
|
||||
pub threshold: usize,
|
||||
|
||||
/// The number of systematic chunks needed.
|
||||
pub systematic_threshold: usize,
|
||||
|
||||
/// A hash of the relevant candidate.
|
||||
pub candidate_hash: CandidateHash,
|
||||
|
||||
/// The root of the erasure encoding of the candidate.
|
||||
pub erasure_root: Hash,
|
||||
|
||||
/// Metrics to report.
|
||||
pub metrics: Metrics,
|
||||
|
||||
/// Do not request data from availability-store. Useful for collators.
|
||||
pub bypass_availability_store: bool,
|
||||
|
||||
/// The type of check to perform after available data was recovered.
|
||||
pub post_recovery_check: PostRecoveryCheck,
|
||||
|
||||
/// The blake2-256 hash of the PoV.
|
||||
pub pov_hash: Hash,
|
||||
|
||||
/// Protocol name for ChunkFetchingV1.
|
||||
pub req_v1_protocol_name: ProtocolName,
|
||||
|
||||
/// Protocol name for ChunkFetchingV2.
|
||||
pub req_v2_protocol_name: ProtocolName,
|
||||
|
||||
/// Whether or not chunk mapping is enabled.
|
||||
pub chunk_mapping_enabled: bool,
|
||||
|
||||
/// Channel to the erasure task handler.
|
||||
pub erasure_task_tx: mpsc::Sender<ErasureTask>,
|
||||
}
|
||||
|
||||
/// A stateful reconstruction of availability data in reference to
|
||||
/// a candidate hash.
|
||||
pub struct RecoveryTask<Sender: overseer::AvailabilityRecoverySenderTrait> {
|
||||
sender: Sender,
|
||||
params: RecoveryParams,
|
||||
strategies: VecDeque<Box<dyn RecoveryStrategy<Sender>>>,
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl<Sender> RecoveryTask<Sender>
|
||||
where
|
||||
Sender: overseer::AvailabilityRecoverySenderTrait,
|
||||
{
|
||||
/// Instantiate a new recovery task.
|
||||
pub fn new(
|
||||
sender: Sender,
|
||||
params: RecoveryParams,
|
||||
strategies: VecDeque<Box<dyn RecoveryStrategy<Sender>>>,
|
||||
) -> Self {
|
||||
Self { sender, params, strategies, state: State::new() }
|
||||
}
|
||||
|
||||
async fn in_availability_store(&mut self) -> Option<AvailableData> {
|
||||
if !self.params.bypass_availability_store {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
self.sender
|
||||
.send_message(AvailabilityStoreMessage::QueryAvailableData(
|
||||
self.params.candidate_hash,
|
||||
tx,
|
||||
))
|
||||
.await;
|
||||
|
||||
match rx.await {
|
||||
Ok(Some(data)) => return Some(data),
|
||||
Ok(None) => {},
|
||||
Err(oneshot::Canceled) => {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?self.params.candidate_hash,
|
||||
"Failed to reach the availability store",
|
||||
)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Run this recovery task to completion. It will loop through the configured strategies
|
||||
/// in-order and return whenever the first one recovers the full `AvailableData`.
|
||||
pub async fn run(mut self) -> Result<AvailableData, RecoveryError> {
|
||||
if let Some(data) = self.in_availability_store().await {
|
||||
return Ok(data);
|
||||
}
|
||||
|
||||
self.params.metrics.on_recovery_started();
|
||||
|
||||
let _timer = self.params.metrics.time_full_recovery();
|
||||
|
||||
while let Some(current_strategy) = self.strategies.pop_front() {
|
||||
let display_name = current_strategy.display_name();
|
||||
let strategy_type = current_strategy.strategy_type();
|
||||
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?self.params.candidate_hash,
|
||||
"Starting `{}` strategy",
|
||||
display_name
|
||||
);
|
||||
|
||||
let res = current_strategy.run(&mut self.state, &mut self.sender, &self.params).await;
|
||||
|
||||
match res {
|
||||
Err(RecoveryError::Unavailable) =>
|
||||
if self.strategies.front().is_some() {
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?self.params.candidate_hash,
|
||||
"Recovery strategy `{}` did not conclude. Trying the next one.",
|
||||
display_name
|
||||
);
|
||||
continue;
|
||||
},
|
||||
Err(err) => {
|
||||
match &err {
|
||||
RecoveryError::Invalid =>
|
||||
self.params.metrics.on_recovery_invalid(strategy_type),
|
||||
_ => self.params.metrics.on_recovery_failed(strategy_type),
|
||||
}
|
||||
return Err(err);
|
||||
},
|
||||
Ok(data) => {
|
||||
self.params.metrics.on_recovery_succeeded(strategy_type, data.encoded_size());
|
||||
return Ok(data);
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// We have no other strategies to try.
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?self.params.candidate_hash,
|
||||
"Recovery of available data failed.",
|
||||
);
|
||||
|
||||
self.params.metrics.on_recovery_failed("all");
|
||||
|
||||
Err(RecoveryError::Unavailable)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,334 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{
|
||||
futures_undead::FuturesUndead,
|
||||
task::{
|
||||
strategy::{
|
||||
do_post_recovery_check, is_unavailable, OngoingRequests, N_PARALLEL,
|
||||
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
|
||||
},
|
||||
RecoveryParams, State,
|
||||
},
|
||||
ErasureTask, RecoveryStrategy, LOG_TARGET,
|
||||
};
|
||||
|
||||
use pezkuwi_node_primitives::AvailableData;
|
||||
use pezkuwi_node_subsystem::{overseer, RecoveryError};
|
||||
use pezkuwi_primitives::ValidatorIndex;
|
||||
|
||||
use futures::{channel::oneshot, SinkExt};
|
||||
use rand::seq::SliceRandom;
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// Parameters specific to the `FetchChunks` strategy.
pub struct FetchChunksParams {
    /// Number of validators. `FetchChunks::new` creates one validator index for each
    /// value in `0..n_validators`.
    pub n_validators: usize,
}
|
||||
|
||||
/// `RecoveryStrategy` that requests chunks from validators, in parallel.
|
||||
pub struct FetchChunks {
|
||||
/// How many requests have been unsuccessful so far.
|
||||
error_count: usize,
|
||||
/// Total number of responses that have been received, including failed ones.
|
||||
total_received_responses: usize,
|
||||
/// A shuffled array of validator indices.
|
||||
validators: VecDeque<ValidatorIndex>,
|
||||
/// Collection of in-flight requests.
|
||||
requesting_chunks: OngoingRequests,
|
||||
}
|
||||
|
||||
impl FetchChunks {
|
||||
/// Instantiate a new strategy.
|
||||
pub fn new(params: FetchChunksParams) -> Self {
|
||||
// Shuffle the validators to make sure that we don't request chunks from the same
|
||||
// validators over and over.
|
||||
let mut validators: VecDeque<ValidatorIndex> =
|
||||
(0..params.n_validators).map(|i| ValidatorIndex(i as u32)).collect();
|
||||
validators.make_contiguous().shuffle(&mut rand::thread_rng());
|
||||
|
||||
Self {
|
||||
error_count: 0,
|
||||
total_received_responses: 0,
|
||||
validators,
|
||||
requesting_chunks: FuturesUndead::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_unavailable(
|
||||
unrequested_validators: usize,
|
||||
in_flight_requests: usize,
|
||||
chunk_count: usize,
|
||||
threshold: usize,
|
||||
) -> bool {
|
||||
is_unavailable(chunk_count, in_flight_requests, unrequested_validators, threshold)
|
||||
}
|
||||
|
||||
/// Desired number of parallel requests.
|
||||
///
|
||||
/// For the given threshold (total required number of chunks) get the desired number of
|
||||
/// requests we want to have running in parallel at this time.
|
||||
fn get_desired_request_count(&self, chunk_count: usize, threshold: usize) -> usize {
|
||||
// Upper bound for parallel requests.
|
||||
// We want to limit this, so requests can be processed within the timeout and we limit the
|
||||
// following feedback loop:
|
||||
// 1. Requests fail due to timeout
|
||||
// 2. We request more chunks to make up for it
|
||||
// 3. Bandwidth is spread out even more, so we get even more timeouts
|
||||
// 4. We request more chunks to make up for it ...
|
||||
let max_requests_boundary = std::cmp::min(N_PARALLEL, threshold);
|
||||
// How many chunks are still needed?
|
||||
let remaining_chunks = threshold.saturating_sub(chunk_count);
|
||||
// What is the current error rate, so we can make up for it?
|
||||
let inv_error_rate =
|
||||
self.total_received_responses.checked_div(self.error_count).unwrap_or(0);
|
||||
// Actual number of requests we want to have in flight in parallel:
|
||||
std::cmp::min(
|
||||
max_requests_boundary,
|
||||
remaining_chunks + remaining_chunks.checked_div(inv_error_rate).unwrap_or(0),
|
||||
)
|
||||
}
|
||||
|
||||
async fn attempt_recovery<Sender: overseer::AvailabilityRecoverySenderTrait>(
|
||||
&mut self,
|
||||
state: &mut State,
|
||||
common_params: &RecoveryParams,
|
||||
) -> Result<AvailableData, RecoveryError> {
|
||||
let recovery_duration =
|
||||
common_params
|
||||
.metrics
|
||||
.time_erasure_recovery(RecoveryStrategy::<Sender>::strategy_type(self));
|
||||
|
||||
// Send request to reconstruct available data from chunks.
|
||||
let (avilable_data_tx, available_data_rx) = oneshot::channel();
|
||||
|
||||
let mut erasure_task_tx = common_params.erasure_task_tx.clone();
|
||||
erasure_task_tx
|
||||
.send(ErasureTask::Reconstruct(
|
||||
common_params.n_validators,
|
||||
// Safe to leave an empty vec in place, as we're stopping the recovery process if
|
||||
// this reconstruct fails.
|
||||
std::mem::take(&mut state.received_chunks)
|
||||
.into_iter()
|
||||
.map(|(c_index, chunk)| (c_index, chunk.chunk))
|
||||
.collect(),
|
||||
avilable_data_tx,
|
||||
))
|
||||
.await
|
||||
.map_err(|_| RecoveryError::ChannelClosed)?;
|
||||
|
||||
let available_data_response =
|
||||
available_data_rx.await.map_err(|_| RecoveryError::ChannelClosed)?;
|
||||
|
||||
match available_data_response {
|
||||
// Attempt post-recovery check.
|
||||
Ok(data) => do_post_recovery_check(common_params, data)
|
||||
.await
|
||||
.inspect_err(|_| {
|
||||
recovery_duration.map(|rd| rd.stop_and_discard());
|
||||
})
|
||||
.inspect(|_| {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
"Data recovery from chunks complete",
|
||||
);
|
||||
}),
|
||||
Err(err) => {
|
||||
recovery_duration.map(|rd| rd.stop_and_discard());
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
?err,
|
||||
"Data recovery error",
|
||||
);
|
||||
|
||||
Err(RecoveryError::Invalid)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender> for FetchChunks {
|
||||
fn display_name(&self) -> &'static str {
|
||||
"Fetch chunks"
|
||||
}
|
||||
|
||||
fn strategy_type(&self) -> &'static str {
|
||||
"regular_chunks"
|
||||
}
|
||||
|
||||
async fn run(
|
||||
mut self: Box<Self>,
|
||||
state: &mut State,
|
||||
sender: &mut Sender,
|
||||
common_params: &RecoveryParams,
|
||||
) -> Result<AvailableData, RecoveryError> {
|
||||
// First query the store for any chunks we've got.
|
||||
if !common_params.bypass_availability_store {
|
||||
let local_chunk_indices = state.populate_from_av_store(common_params, sender).await;
|
||||
self.validators.retain(|validator_index| {
|
||||
!local_chunk_indices.iter().any(|(v_index, _)| v_index == validator_index)
|
||||
});
|
||||
}
|
||||
|
||||
// No need to query the validators that have the chunks we already received or that we know
|
||||
// don't have the data from previous strategies.
|
||||
self.validators.retain(|v_index| {
|
||||
!state.received_chunks.values().any(|c| v_index == &c.validator_index) &&
|
||||
state.can_retry_request(
|
||||
&(common_params.validator_authority_keys[v_index.0 as usize].clone(), *v_index),
|
||||
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
|
||||
)
|
||||
});
|
||||
|
||||
// Safe to `take` here, as we're consuming `self` anyway and we're not using the
|
||||
// `validators` field in other methods.
|
||||
let mut validators_queue: VecDeque<_> = std::mem::take(&mut self.validators)
|
||||
.into_iter()
|
||||
.map(|validator_index| {
|
||||
(
|
||||
common_params.validator_authority_keys[validator_index.0 as usize].clone(),
|
||||
validator_index,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
loop {
|
||||
// If received_chunks has more than threshold entries, attempt to recover the data.
|
||||
// If that fails, or a re-encoding of it doesn't match the expected erasure root,
|
||||
// return Err(RecoveryError::Invalid).
|
||||
// Do this before requesting any chunks because we may have enough of them coming from
|
||||
// past RecoveryStrategies.
|
||||
if state.chunk_count() >= common_params.threshold {
|
||||
return self.attempt_recovery::<Sender>(state, common_params).await;
|
||||
}
|
||||
|
||||
if Self::is_unavailable(
|
||||
validators_queue.len(),
|
||||
self.requesting_chunks.total_len(),
|
||||
state.chunk_count(),
|
||||
common_params.threshold,
|
||||
) {
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
received = %state.chunk_count(),
|
||||
requesting = %self.requesting_chunks.len(),
|
||||
total_requesting = %self.requesting_chunks.total_len(),
|
||||
n_validators = %common_params.n_validators,
|
||||
"Data recovery from chunks is not possible",
|
||||
);
|
||||
|
||||
return Err(RecoveryError::Unavailable);
|
||||
}
|
||||
|
||||
let desired_requests_count =
|
||||
self.get_desired_request_count(state.chunk_count(), common_params.threshold);
|
||||
let already_requesting_count = self.requesting_chunks.len();
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
?common_params.candidate_hash,
|
||||
?desired_requests_count,
|
||||
error_count= ?self.error_count,
|
||||
total_received = ?self.total_received_responses,
|
||||
threshold = ?common_params.threshold,
|
||||
?already_requesting_count,
|
||||
"Requesting availability chunks for a candidate",
|
||||
);
|
||||
|
||||
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);
|
||||
|
||||
state
|
||||
.launch_parallel_chunk_requests(
|
||||
strategy_type,
|
||||
common_params,
|
||||
sender,
|
||||
desired_requests_count,
|
||||
&mut validators_queue,
|
||||
&mut self.requesting_chunks,
|
||||
)
|
||||
.await;
|
||||
|
||||
let (total_responses, error_count) = state
|
||||
.wait_for_chunks(
|
||||
strategy_type,
|
||||
common_params,
|
||||
REGULAR_CHUNKS_REQ_RETRY_LIMIT,
|
||||
&mut validators_queue,
|
||||
&mut self.requesting_chunks,
|
||||
&mut vec![],
|
||||
|unrequested_validators,
|
||||
in_flight_reqs,
|
||||
chunk_count,
|
||||
_systematic_chunk_count| {
|
||||
chunk_count >= common_params.threshold ||
|
||||
Self::is_unavailable(
|
||||
unrequested_validators,
|
||||
in_flight_reqs,
|
||||
chunk_count,
|
||||
common_params.threshold,
|
||||
)
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
self.total_received_responses += total_responses;
|
||||
self.error_count += error_count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pezkuwi_erasure_coding::recovery_threshold;

    /// Walks `get_desired_request_count` through several error rates and caps.
    #[test]
    fn test_get_desired_request_count() {
        let n_validators = 100;
        let threshold = recovery_threshold(n_validators).unwrap();

        let mut strategy = FetchChunks::new(FetchChunksParams { n_validators });
        // Nothing received and no errors yet: ask for exactly `threshold` chunks.
        assert_eq!(strategy.get_desired_request_count(0, threshold), threshold);

        strategy.error_count = 1;
        strategy.total_received_responses = 1;
        // Capped at threshold (34):
        assert_eq!(strategy.get_desired_request_count(0, threshold), threshold);

        // Capped at the parallel limit.
        assert_eq!(strategy.get_desired_request_count(0, N_PARALLEL + 2), N_PARALLEL);

        strategy.total_received_responses = 2;
        // One error in two responses - still capped at threshold:
        assert_eq!(strategy.get_desired_request_count(1, threshold), threshold);

        strategy.total_received_responses = 10;
        // error rate: 1/10
        // remaining chunks needed: threshold (34) - 9 = 25
        // expected: 25 + 25 / 10 (integer division) = 27
        assert_eq!(strategy.get_desired_request_count(9, threshold), 27);
        // Capped at the parallel limit.
        assert_eq!(strategy.get_desired_request_count(9, N_PARALLEL + 9), N_PARALLEL);

        strategy.error_count = 0;
        // Without any errors exactly the missing chunks are requested:
        assert_eq!(strategy.get_desired_request_count(10, threshold), threshold - 10);
    }
}
|
||||
@@ -0,0 +1,174 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{
|
||||
task::{RecoveryParams, RecoveryStrategy, State},
|
||||
ErasureTask, PostRecoveryCheck, LOG_TARGET,
|
||||
};
|
||||
|
||||
use pezkuwi_node_network_protocol::request_response::{
|
||||
self as req_res, outgoing::RequestError, OutgoingRequest, Recipient, Requests,
|
||||
};
|
||||
use pezkuwi_node_primitives::AvailableData;
|
||||
use pezkuwi_node_subsystem::{messages::NetworkBridgeTxMessage, overseer, RecoveryError};
|
||||
use pezkuwi_primitives::ValidatorIndex;
|
||||
use sc_network::{IfDisconnected, OutboundFailure, RequestFailure};
|
||||
|
||||
use futures::{channel::oneshot, SinkExt};
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
/// Parameters specific to the `FetchFull` strategy.
|
||||
pub struct FetchFullParams {
|
||||
/// Validators that will be used for fetching the data.
|
||||
pub validators: Vec<ValidatorIndex>,
|
||||
}
|
||||
|
||||
/// `RecoveryStrategy` that sequentially tries to fetch the full `AvailableData` from
|
||||
/// already-connected validators in the configured validator set.
|
||||
pub struct FetchFull {
|
||||
params: FetchFullParams,
|
||||
}
|
||||
|
||||
impl FetchFull {
|
||||
/// Create a new `FetchFull` recovery strategy.
|
||||
pub fn new(mut params: FetchFullParams) -> Self {
|
||||
params.validators.shuffle(&mut rand::thread_rng());
|
||||
Self { params }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender> for FetchFull {
|
||||
fn display_name(&self) -> &'static str {
|
||||
"Full recovery from backers"
|
||||
}
|
||||
|
||||
fn strategy_type(&self) -> &'static str {
|
||||
"full_from_backers"
|
||||
}
|
||||
|
||||
async fn run(
|
||||
mut self: Box<Self>,
|
||||
_: &mut State,
|
||||
sender: &mut Sender,
|
||||
common_params: &RecoveryParams,
|
||||
) -> Result<AvailableData, RecoveryError> {
|
||||
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);
|
||||
|
||||
loop {
|
||||
// Pop the next validator.
|
||||
let validator_index =
|
||||
self.params.validators.pop().ok_or_else(|| RecoveryError::Unavailable)?;
|
||||
|
||||
// Request data.
|
||||
let (req, response) = OutgoingRequest::new(
|
||||
Recipient::Authority(
|
||||
common_params.validator_authority_keys[validator_index.0 as usize].clone(),
|
||||
),
|
||||
req_res::v1::AvailableDataFetchingRequest {
|
||||
candidate_hash: common_params.candidate_hash,
|
||||
},
|
||||
);
|
||||
|
||||
sender
|
||||
.send_message(NetworkBridgeTxMessage::SendRequests(
|
||||
vec![Requests::AvailableDataFetchingV1(req)],
|
||||
IfDisconnected::ImmediateError,
|
||||
))
|
||||
.await;
|
||||
|
||||
common_params.metrics.on_full_request_issued();
|
||||
|
||||
match response.await {
|
||||
Ok(req_res::v1::AvailableDataFetchingResponse::AvailableData(data)) => {
|
||||
let recovery_duration =
|
||||
common_params.metrics.time_erasure_recovery(strategy_type);
|
||||
let maybe_data = match common_params.post_recovery_check {
|
||||
PostRecoveryCheck::Reencode => {
|
||||
let (reencode_tx, reencode_rx) = oneshot::channel();
|
||||
let mut erasure_task_tx = common_params.erasure_task_tx.clone();
|
||||
|
||||
erasure_task_tx
|
||||
.send(ErasureTask::Reencode(
|
||||
common_params.n_validators,
|
||||
common_params.erasure_root,
|
||||
data,
|
||||
reencode_tx,
|
||||
))
|
||||
.await
|
||||
.map_err(|_| RecoveryError::ChannelClosed)?;
|
||||
|
||||
reencode_rx.await.map_err(|_| RecoveryError::ChannelClosed)?
|
||||
},
|
||||
PostRecoveryCheck::PovHash =>
|
||||
(data.pov.hash() == common_params.pov_hash).then_some(data),
|
||||
};
|
||||
|
||||
match maybe_data {
|
||||
Some(data) => {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
"Received full data",
|
||||
);
|
||||
|
||||
common_params.metrics.on_full_request_succeeded();
|
||||
return Ok(data);
|
||||
},
|
||||
None => {
|
||||
common_params.metrics.on_full_request_invalid();
|
||||
recovery_duration.map(|rd| rd.stop_and_discard());
|
||||
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
?validator_index,
|
||||
"Invalid data response",
|
||||
);
|
||||
|
||||
// it doesn't help to report the peer with req/res.
|
||||
// we'll try the next backer.
|
||||
},
|
||||
}
|
||||
},
|
||||
Ok(req_res::v1::AvailableDataFetchingResponse::NoSuchData) => {
|
||||
common_params.metrics.on_full_request_no_such_data();
|
||||
},
|
||||
Err(e) => {
|
||||
match &e {
|
||||
RequestError::Canceled(_) => common_params.metrics.on_full_request_error(),
|
||||
RequestError::InvalidResponse(_) =>
|
||||
common_params.metrics.on_full_request_invalid(),
|
||||
RequestError::NetworkError(req_failure) => {
|
||||
if let RequestFailure::Network(OutboundFailure::Timeout) = req_failure {
|
||||
common_params.metrics.on_full_request_timeout();
|
||||
} else {
|
||||
common_params.metrics.on_full_request_error();
|
||||
}
|
||||
},
|
||||
};
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
?validator_index,
|
||||
err = ?e,
|
||||
"Error fetching full available data."
|
||||
);
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,341 @@
|
||||
// Copyright (C) Parity Technologies (UK) Ltd.
|
||||
// This file is part of Pezkuwi.
|
||||
|
||||
// Pezkuwi is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Pezkuwi is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Pezkuwi. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use crate::{
|
||||
futures_undead::FuturesUndead,
|
||||
task::{
|
||||
strategy::{
|
||||
do_post_recovery_check, is_unavailable, OngoingRequests, N_PARALLEL,
|
||||
SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
|
||||
},
|
||||
RecoveryParams, RecoveryStrategy, State,
|
||||
},
|
||||
LOG_TARGET,
|
||||
};
|
||||
|
||||
use pezkuwi_node_primitives::AvailableData;
|
||||
use pezkuwi_node_subsystem::{overseer, RecoveryError};
|
||||
use pezkuwi_primitives::{ChunkIndex, ValidatorIndex};
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// Parameters needed for fetching systematic chunks.
|
||||
pub struct FetchSystematicChunksParams {
|
||||
/// Validators that hold the systematic chunks.
|
||||
pub validators: Vec<(ChunkIndex, ValidatorIndex)>,
|
||||
/// Validators in the backing group, to be used as a backup for requesting systematic chunks.
|
||||
pub backers: Vec<ValidatorIndex>,
|
||||
}
|
||||
|
||||
/// `RecoveryStrategy` that attempts to recover the systematic chunks from the validators that
|
||||
/// hold them, in order to bypass the erasure code reconstruction step, which is costly.
|
||||
pub struct FetchSystematicChunks {
|
||||
/// Systematic recovery threshold.
|
||||
threshold: usize,
|
||||
/// Validators that hold the systematic chunks.
|
||||
validators: Vec<(ChunkIndex, ValidatorIndex)>,
|
||||
/// Backers to be used as a backup.
|
||||
backers: Vec<ValidatorIndex>,
|
||||
/// Collection of in-flight requests.
|
||||
requesting_chunks: OngoingRequests,
|
||||
}
|
||||
|
||||
impl FetchSystematicChunks {
|
||||
/// Instantiate a new systematic chunks strategy.
|
||||
pub fn new(params: FetchSystematicChunksParams) -> Self {
|
||||
Self {
|
||||
threshold: params.validators.len(),
|
||||
validators: params.validators,
|
||||
backers: params.backers,
|
||||
requesting_chunks: FuturesUndead::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_unavailable(
|
||||
unrequested_validators: usize,
|
||||
in_flight_requests: usize,
|
||||
systematic_chunk_count: usize,
|
||||
threshold: usize,
|
||||
) -> bool {
|
||||
is_unavailable(
|
||||
systematic_chunk_count,
|
||||
in_flight_requests,
|
||||
unrequested_validators,
|
||||
threshold,
|
||||
)
|
||||
}
|
||||
|
||||
/// Desired number of parallel requests.
|
||||
///
|
||||
/// For the given threshold (total required number of chunks) get the desired number of
|
||||
/// requests we want to have running in parallel at this time.
|
||||
fn get_desired_request_count(&self, chunk_count: usize, threshold: usize) -> usize {
|
||||
// Upper bound for parallel requests.
|
||||
let max_requests_boundary = std::cmp::min(N_PARALLEL, threshold);
|
||||
// How many chunks are still needed?
|
||||
let remaining_chunks = threshold.saturating_sub(chunk_count);
|
||||
// Actual number of requests we want to have in flight in parallel:
|
||||
// We don't have to make up for any error rate, as an error fetching a systematic chunk
|
||||
// results in failure of the entire strategy.
|
||||
std::cmp::min(max_requests_boundary, remaining_chunks)
|
||||
}
|
||||
|
||||
async fn attempt_systematic_recovery<Sender: overseer::AvailabilityRecoverySenderTrait>(
|
||||
&mut self,
|
||||
state: &mut State,
|
||||
common_params: &RecoveryParams,
|
||||
) -> Result<AvailableData, RecoveryError> {
|
||||
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(self);
|
||||
let recovery_duration = common_params.metrics.time_erasure_recovery(strategy_type);
|
||||
let reconstruct_duration = common_params.metrics.time_erasure_reconstruct(strategy_type);
|
||||
let chunks = state
|
||||
.received_chunks
|
||||
.range(
|
||||
ChunkIndex(0)..
|
||||
ChunkIndex(
|
||||
u32::try_from(self.threshold)
|
||||
.expect("validator count should not exceed u32"),
|
||||
),
|
||||
)
|
||||
.map(|(_, chunk)| chunk.chunk.clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let available_data = pezkuwi_erasure_coding::reconstruct_from_systematic_v1(
|
||||
common_params.n_validators,
|
||||
chunks,
|
||||
);
|
||||
|
||||
match available_data {
|
||||
Ok(data) => {
|
||||
drop(reconstruct_duration);
|
||||
|
||||
// Attempt post-recovery check.
|
||||
do_post_recovery_check(common_params, data)
|
||||
.await
|
||||
.inspect_err(|_| {
|
||||
recovery_duration.map(|rd| rd.stop_and_discard());
|
||||
})
|
||||
.inspect(|_| {
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
"Data recovery from systematic chunks complete",
|
||||
);
|
||||
})
|
||||
},
|
||||
Err(err) => {
|
||||
reconstruct_duration.map(|rd| rd.stop_and_discard());
|
||||
recovery_duration.map(|rd| rd.stop_and_discard());
|
||||
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
?err,
|
||||
"Systematic data recovery error",
|
||||
);
|
||||
|
||||
Err(RecoveryError::Invalid)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<Sender: overseer::AvailabilityRecoverySenderTrait> RecoveryStrategy<Sender>
|
||||
for FetchSystematicChunks
|
||||
{
|
||||
fn display_name(&self) -> &'static str {
|
||||
"Fetch systematic chunks"
|
||||
}
|
||||
|
||||
fn strategy_type(&self) -> &'static str {
|
||||
"systematic_chunks"
|
||||
}
|
||||
|
||||
async fn run(
|
||||
mut self: Box<Self>,
|
||||
state: &mut State,
|
||||
sender: &mut Sender,
|
||||
common_params: &RecoveryParams,
|
||||
) -> Result<AvailableData, RecoveryError> {
|
||||
// First query the store for any chunks we've got.
|
||||
if !common_params.bypass_availability_store {
|
||||
let local_chunk_indices = state.populate_from_av_store(common_params, sender).await;
|
||||
|
||||
for (_, our_c_index) in &local_chunk_indices {
|
||||
// If we are among the systematic validators but hold an invalid chunk, we cannot
|
||||
// perform the systematic recovery. Fall through to the next strategy.
|
||||
if self.validators.iter().any(|(c_index, _)| c_index == our_c_index) &&
|
||||
!state.received_chunks.contains_key(our_c_index)
|
||||
{
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
requesting = %self.requesting_chunks.len(),
|
||||
total_requesting = %self.requesting_chunks.total_len(),
|
||||
n_validators = %common_params.n_validators,
|
||||
chunk_index = ?our_c_index,
|
||||
"Systematic chunk recovery is not possible. We are among the systematic validators but hold an invalid chunk",
|
||||
);
|
||||
return Err(RecoveryError::Unavailable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No need to query the validators that have the chunks we already received or that we know
|
||||
// don't have the data from previous strategies.
|
||||
self.validators.retain(|(c_index, v_index)| {
|
||||
!state.received_chunks.contains_key(c_index) &&
|
||||
state.can_retry_request(
|
||||
&(common_params.validator_authority_keys[v_index.0 as usize].clone(), *v_index),
|
||||
SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
|
||||
)
|
||||
});
|
||||
|
||||
let mut systematic_chunk_count = state
|
||||
.received_chunks
|
||||
.range(ChunkIndex(0)..ChunkIndex(self.threshold as u32))
|
||||
.count();
|
||||
|
||||
// Safe to `take` here, as we're consuming `self` anyway and we're not using the
|
||||
// `validators` or `backers` fields in other methods.
|
||||
let mut validators_queue: VecDeque<_> = std::mem::take(&mut self.validators)
|
||||
.into_iter()
|
||||
.map(|(_, validator_index)| {
|
||||
(
|
||||
common_params.validator_authority_keys[validator_index.0 as usize].clone(),
|
||||
validator_index,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let mut backers: Vec<_> = std::mem::take(&mut self.backers)
|
||||
.into_iter()
|
||||
.map(|validator_index| {
|
||||
common_params.validator_authority_keys[validator_index.0 as usize].clone()
|
||||
})
|
||||
.collect();
|
||||
|
||||
loop {
|
||||
// If received_chunks has `systematic_chunk_threshold` entries, attempt to recover the
|
||||
// data.
|
||||
if systematic_chunk_count >= self.threshold {
|
||||
return self.attempt_systematic_recovery::<Sender>(state, common_params).await;
|
||||
}
|
||||
|
||||
if Self::is_unavailable(
|
||||
validators_queue.len(),
|
||||
self.requesting_chunks.total_len(),
|
||||
systematic_chunk_count,
|
||||
self.threshold,
|
||||
) {
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?common_params.candidate_hash,
|
||||
erasure_root = ?common_params.erasure_root,
|
||||
%systematic_chunk_count,
|
||||
requesting = %self.requesting_chunks.len(),
|
||||
total_requesting = %self.requesting_chunks.total_len(),
|
||||
n_validators = %common_params.n_validators,
|
||||
systematic_threshold = ?self.threshold,
|
||||
"Data recovery from systematic chunks is not possible",
|
||||
);
|
||||
|
||||
return Err(RecoveryError::Unavailable);
|
||||
}
|
||||
|
||||
let desired_requests_count =
|
||||
self.get_desired_request_count(systematic_chunk_count, self.threshold);
|
||||
let already_requesting_count = self.requesting_chunks.len();
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
?common_params.candidate_hash,
|
||||
?desired_requests_count,
|
||||
total_received = ?systematic_chunk_count,
|
||||
systematic_threshold = ?self.threshold,
|
||||
?already_requesting_count,
|
||||
"Requesting systematic availability chunks for a candidate",
|
||||
);
|
||||
|
||||
let strategy_type = RecoveryStrategy::<Sender>::strategy_type(&*self);
|
||||
|
||||
state
|
||||
.launch_parallel_chunk_requests(
|
||||
strategy_type,
|
||||
common_params,
|
||||
sender,
|
||||
desired_requests_count,
|
||||
&mut validators_queue,
|
||||
&mut self.requesting_chunks,
|
||||
)
|
||||
.await;
|
||||
|
||||
let _ = state
|
||||
.wait_for_chunks(
|
||||
strategy_type,
|
||||
common_params,
|
||||
SYSTEMATIC_CHUNKS_REQ_RETRY_LIMIT,
|
||||
&mut validators_queue,
|
||||
&mut self.requesting_chunks,
|
||||
&mut backers,
|
||||
|unrequested_validators,
|
||||
in_flight_reqs,
|
||||
// Don't use this chunk count, as it may contain non-systematic chunks.
|
||||
_chunk_count,
|
||||
new_systematic_chunk_count| {
|
||||
systematic_chunk_count = new_systematic_chunk_count;
|
||||
|
||||
let is_unavailable = Self::is_unavailable(
|
||||
unrequested_validators,
|
||||
in_flight_reqs,
|
||||
systematic_chunk_count,
|
||||
self.threshold,
|
||||
);
|
||||
|
||||
systematic_chunk_count >= self.threshold || is_unavailable
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
	use super::*;
	use pezkuwi_erasure_coding::systematic_recovery_threshold;

	/// Pins the values returned by `get_desired_request_count` for a handful of
	/// `(chunk_count, threshold)` combinations.
	#[test]
	fn test_get_desired_request_count() {
		let num_validators = 100;
		let threshold = systematic_recovery_threshold(num_validators).unwrap();

		let task = FetchSystematicChunks::new(FetchSystematicChunksParams {
			validators: vec![(1.into(), 1.into()); num_validators],
			backers: vec![],
		});

		// Each row is `(chunk_count, threshold, expected_request_count)`.
		let cases = [
			// Nothing received yet.
			(0, threshold, threshold),
			// Some chunks already received.
			(5, threshold, threshold - 5),
			// More chunks received than the threshold asks for.
			(num_validators * 2, threshold, 0),
			// Thresholds expressed relative to `N_PARALLEL`.
			(0, N_PARALLEL * 2, N_PARALLEL),
			(N_PARALLEL, N_PARALLEL + 2, 2),
		];

		for &(chunk_count, wanted_threshold, expected) in cases.iter() {
			assert_eq!(task.get_desired_request_count(chunk_count, wanted_threshold), expected);
		}
	}
}
|
||||
Reference in New Issue
Block a user