Request based availability distribution (#2423)
* WIP
* availability distribution, still very wip. Work on the requesting side of things.
* Some docs on what I intend to do.
* Checkpoint of session cache implementation as I will likely replace it with something smarter.
* More work, mostly on cache and getting things to type check.
* Only derive MallocSizeOf and Debug for std.
* availability-distribution: Cache feature complete.
* Sketch out logic in `FetchTask` for actual fetching. - Compile fixes. - Cleanup.
* Format cleanup.
* More format fixes.
* Almost feature complete `fetch_task`. Missing: - Check for cancel - Actual querying of peer ids.
* Finish FetchTask so far.
* Directly use AuthorityDiscoveryId in protocol and cache.
* Resolve `AuthorityDiscoveryId` on sending requests.
* Rework fetch_task - also make it impossible to check the wrong chunk index. - Export needed function in validator_discovery.
* From<u32> implementation for `ValidatorIndex`.
* Fixes and more integration work.
* Make session cache proper lru cache.
* Use proper lru cache.
* Requester finished.
* ProtocolState -> Requester. Also make sure to not fetch our own chunk.
* Cleanup + fixes.
* Remove unused functions - FetchTask::is_finished - SessionCache::fetch_session_info
* availability-distribution responding side.
* Cleanup + Fixes.
* More fixes.
* More fixes. adder-collator is running!
* Some docs.
* Docs.
* Fix reporting of bad guys.
* Fix tests
* Make all tests compile.
* Fix test.
* Cleanup + get rid of some warnings.
* state -> requester
* Mostly doc fixes.
* Fix test suite.
* Get rid of now redundant message types.
* WIP
* Rob's review remarks.
* Fix test suite.
* core.relay_parent -> leaf for session request.
* Style fix.
* Decrease request timeout.
* Cleanup obsolete errors.
* Metrics + don't fail on non fatal errors.
* requester.rs -> requester/mod.rs
* Panic on invalid BadValidator report.
* Fix indentation.
* Use typed default timeout constant.
* Make channel size 0, as each sender gets one slot anyways.
* Fix incorrect metrics initialization.
* Fix build after merge.
* More fixes.
* Hopefully valid metrics names.
* Better metrics names.
* Some tests that already work.
* Slightly better docs.
* Some more tests.
* Fix network bridge test.
@@ -0,0 +1,89 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
|
||||
//! Error handling related code and Error/Result definitions.
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
use futures::channel::oneshot;
|
||||
|
||||
use polkadot_node_subsystem_util::Error as UtilError;
|
||||
use polkadot_primitives::v1::SessionIndex;
|
||||
use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError};
|
||||
|
||||
/// Errors of this subsystem.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum Error {
|
||||
#[error("Response channel to obtain QueryChunk failed")]
|
||||
QueryChunkResponseChannel(#[source] oneshot::Canceled),
|
||||
|
||||
#[error("Receive channel closed")]
|
||||
IncomingMessageChannel(#[source] SubsystemError),
|
||||
|
||||
/// Some request to utility functions failed.
|
||||
#[error("Runtime request failed")]
|
||||
UtilRequest(#[source] UtilError),
|
||||
|
||||
/// Some request to the runtime failed.
|
||||
#[error("Runtime request failed")]
|
||||
RuntimeRequestCanceled(#[source] oneshot::Canceled),
|
||||
|
||||
/// Some request to the runtime failed.
|
||||
#[error("Runtime request failed")]
|
||||
RuntimeRequest(#[source] RuntimeApiError),
|
||||
|
||||
/// We tried fetching a session which was not available.
|
||||
#[error("No such session")]
|
||||
NoSuchSession(SessionIndex),
|
||||
|
||||
/// Spawning a running task failed.
|
||||
#[error("Spawning subsystem task failed")]
|
||||
SpawnTask(#[source] SubsystemError),
|
||||
|
||||
/// We tried accessing a session that was not cached.
|
||||
#[error("Session is not cached.")]
|
||||
NoSuchCachedSession,
|
||||
|
||||
/// Requester stream exhausted.
|
||||
#[error("Erasure chunk requester stream exhausted")]
|
||||
RequesterExhausted,
|
||||
|
||||
/// Sending response failed.
|
||||
#[error("Sending a request's response failed.")]
|
||||
SendResponse,
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl From<SubsystemError> for Error {
|
||||
fn from(err: SubsystemError) -> Self {
|
||||
Self::IncomingMessageChannel(err)
|
||||
}
|
||||
}
|
||||
|
||||
/// Receive a response from a runtime request and convert errors.
|
||||
pub(crate) async fn recv_runtime<V>(
|
||||
r: std::result::Result<
|
||||
oneshot::Receiver<std::result::Result<V, RuntimeApiError>>,
|
||||
UtilError,
|
||||
>,
|
||||
) -> Result<V> {
|
||||
r.map_err(Error::UtilRequest)?
|
||||
.await
|
||||
.map_err(Error::RuntimeRequestCanceled)?
|
||||
.map_err(Error::RuntimeRequest)
|
||||
}
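
The `recv_runtime` helper above collapses three failure layers into this module's `Error`: the utility error returned when issuing the request, a canceled oneshot response channel, and the runtime API error carried inside the response. A minimal, self-contained sketch of the same chaining pattern, using plain stand-in types rather than the Polkadot ones:

use futures::channel::oneshot;

#[derive(Debug)]
enum DemoError {
    UtilRequest(String),                        // issuing the request failed
    RuntimeRequestCanceled(oneshot::Canceled),  // response channel was dropped
    RuntimeRequest(String),                     // the runtime itself reported an error
}

// Same shape as `recv_runtime`: unwrap the outer request error, await the
// channel, then unwrap the inner runtime error.
async fn recv_runtime_demo<V>(
    r: Result<oneshot::Receiver<Result<V, String>>, String>,
) -> Result<V, DemoError> {
    r.map_err(DemoError::UtilRequest)?
        .await
        .map_err(DemoError::RuntimeRequestCanceled)?
        .map_err(DemoError::RuntimeRequest)
}

fn main() {
    let (tx, rx) = oneshot::channel();
    tx.send(Ok::<u32, String>(42)).unwrap();
    let v = futures::executor::block_on(recv_runtime_demo(Ok(rx))).unwrap();
    assert_eq!(v, 42);
}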
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,117 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use polkadot_node_subsystem_util::metrics::prometheus::{Counter, U64, Registry, PrometheusError, CounterVec, Opts};
|
||||
use polkadot_node_subsystem_util::metrics::prometheus;
|
||||
use polkadot_node_subsystem_util::metrics;
|
||||
|
||||
/// Label for success counters.
|
||||
pub const SUCCEEDED: &'static str = "succeeded";
|
||||
|
||||
/// Label for fail counters.
|
||||
pub const FAILED: &'static str = "failed";
|
||||
|
||||
/// Label for chunks that could not be served, because they were not available.
|
||||
pub const NOT_FOUND: &'static str = "not-found";
|
||||
|
||||
/// Availability Distribution metrics.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Metrics(Option<MetricsInner>);
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
struct MetricsInner {
|
||||
/// Number of chunks fetched.
|
||||
///
|
||||
/// Note: The failed count gets incremented when we were not able to fetch the chunk at all.
/// For cases where we failed downloading but succeeded on the next try (with different
/// backers), see `retries`.
|
||||
fetched_chunks: CounterVec<U64>,
|
||||
|
||||
/// Number of chunks served.
|
||||
///
|
||||
/// Note: Right now, `Succeeded` gets incremented whenever we were able to successfully respond
|
||||
/// to a chunk request. This includes `NoSuchChunk` responses.
|
||||
served_chunks: CounterVec<U64>,
|
||||
|
||||
/// Number of times our first set of validators did not provide the needed chunk and we had to
|
||||
/// query further validators.
|
||||
retries: Counter<U64>,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
/// Create new dummy metrics, not reporting anything.
|
||||
pub fn new_dummy() -> Self {
|
||||
Metrics(None)
|
||||
}
|
||||
|
||||
/// Increment counter on fetched labels.
|
||||
pub fn on_fetch(&self, label: &'static str) {
|
||||
if let Some(metrics) = &self.0 {
|
||||
metrics.fetched_chunks.with_label_values(&[label]).inc()
|
||||
}
|
||||
}
|
||||
|
||||
/// Increment counter on served chunks.
|
||||
pub fn on_served(&self, label: &'static str) {
|
||||
if let Some(metrics) = &self.0 {
|
||||
metrics.served_chunks.with_label_values(&[label]).inc()
|
||||
}
|
||||
}
|
||||
|
||||
/// Increment retry counter.
|
||||
pub fn on_retry(&self) {
|
||||
if let Some(metrics) = &self.0 {
|
||||
metrics.retries.inc()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl metrics::Metrics for Metrics {
|
||||
fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
|
||||
let metrics = MetricsInner {
|
||||
fetched_chunks: prometheus::register(
|
||||
CounterVec::new(
|
||||
Opts::new(
|
||||
"parachain_fetched_chunks_total",
|
||||
"Total number of fetched chunks.",
|
||||
),
|
||||
&["success"]
|
||||
)?,
|
||||
registry,
|
||||
)?,
|
||||
served_chunks: prometheus::register(
|
||||
CounterVec::new(
|
||||
Opts::new(
|
||||
"parachain_served_chunks_total",
|
||||
"Total number of chunks served by this backer.",
|
||||
),
|
||||
&["success"]
|
||||
)?,
|
||||
registry,
|
||||
)?,
|
||||
retries: prometheus::register(
|
||||
Counter::new(
|
||||
"parachain_fetch_retries_total",
|
||||
"Number of times we did not succeed in fetching a chunk and needed to try more backers.",
|
||||
)?,
|
||||
registry,
|
||||
)?,
|
||||
};
|
||||
Ok(Metrics(Some(metrics)))
|
||||
}
|
||||
}
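
For readers unfamiliar with the labelled counters used above: `fetched_chunks` and `served_chunks` are `CounterVec`s keyed by a single `success` label, which is why `on_fetch`/`on_served` take a label string. A small stand-alone sketch of the same registration and increment flow, using the plain `prometheus` crate rather than the subsystem-util wrapper used in this file:

use prometheus::{CounterVec, Opts, Registry};

fn main() -> Result<(), prometheus::Error> {
    let registry = Registry::new();

    // Mirrors the `fetched_chunks` registration in `try_register` above.
    let fetched_chunks = CounterVec::new(
        Opts::new("parachain_fetched_chunks_total", "Total number of fetched chunks."),
        &["success"],
    )?;
    registry.register(Box::new(fetched_chunks.clone()))?;

    // Equivalent of `metrics.on_fetch(SUCCEEDED)` / `metrics.on_fetch(FAILED)`:
    fetched_chunks.with_label_values(&["succeeded"]).inc();
    fetched_chunks.with_label_values(&["failed"]).inc();

    assert_eq!(fetched_chunks.with_label_values(&["succeeded"]).get() as u64, 1);
    Ok(())
}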
|
||||
|
||||
@@ -0,0 +1,421 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use futures::channel::mpsc;
|
||||
use futures::channel::oneshot;
|
||||
use futures::future::select;
|
||||
use futures::{FutureExt, SinkExt};
|
||||
|
||||
use polkadot_erasure_coding::branch_hash;
|
||||
use polkadot_node_network_protocol::request_response::{
|
||||
request::{OutgoingRequest, RequestError, Requests},
|
||||
v1::{AvailabilityFetchingRequest, AvailabilityFetchingResponse},
|
||||
};
|
||||
use polkadot_primitives::v1::{
|
||||
AuthorityDiscoveryId, BlakeTwo256, ErasureChunk, GroupIndex, Hash, HashT, OccupiedCore,
|
||||
SessionIndex,
|
||||
};
|
||||
use polkadot_subsystem::messages::{
|
||||
AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage,
|
||||
};
|
||||
use polkadot_subsystem::SubsystemContext;
|
||||
|
||||
use crate::{
|
||||
error::{Error, Result},
|
||||
session_cache::{BadValidators, SessionInfo},
|
||||
LOG_TARGET,
|
||||
metrics::{Metrics, SUCCEEDED, FAILED},
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// Configuration for a `FetchTask`
|
||||
///
|
||||
/// This exists to separate preparation of a `FetchTask` from actually starting it, which is
/// beneficial as it allows for taking session info by reference.
|
||||
pub struct FetchTaskConfig {
|
||||
prepared_running: Option<RunningTask>,
|
||||
live_in: HashSet<Hash>,
|
||||
}
|
||||
|
||||
/// Information about a task fetching an erasure chunk.
|
||||
pub struct FetchTask {
|
||||
/// For what relay parents this task is relevant.
|
||||
///
|
||||
/// In other words, for which relay chain parents this candidate is considered live.
|
||||
/// This is updated on every `ActiveLeavesUpdate` and enables us to know when we can safely
|
||||
/// stop keeping track of that candidate/chunk.
|
||||
live_in: HashSet<Hash>,
|
||||
|
||||
/// We keep the task around until `live_in` becomes empty, to make
|
||||
/// sure we won't re-fetch an already fetched candidate.
|
||||
state: FetchedState,
|
||||
}
|
||||
|
||||
/// State of a particular candidate chunk fetching process.
|
||||
enum FetchedState {
|
||||
/// Chunk fetch has started.
|
||||
///
|
||||
/// Once the contained `Sender` is dropped, any still running task will be canceled.
|
||||
Started(oneshot::Sender<()>),
|
||||
/// All relevant live_in have been removed, before we were able to get our chunk.
|
||||
Canceled,
|
||||
}
|
||||
|
||||
/// Messages sent from `FetchTask`s to be handled/forwarded.
|
||||
pub enum FromFetchTask {
|
||||
/// Message to other subsystem.
|
||||
Message(AllMessages),
|
||||
|
||||
/// Concluded with result.
|
||||
///
|
||||
/// In case of `None` everything was fine, in case of `Some`, some validators in the group
|
||||
/// did not serve us our chunk as expected.
|
||||
Concluded(Option<BadValidators>),
|
||||
}
|
||||
|
||||
/// Information a running task needs.
|
||||
struct RunningTask {
|
||||
/// For what session we have been spawned.
|
||||
session_index: SessionIndex,
|
||||
|
||||
/// Index of validator group to fetch the chunk from.
|
||||
///
|
||||
/// Needed for reporting bad validators.
|
||||
group_index: GroupIndex,
|
||||
|
||||
/// Validators to request the chunk from.
|
||||
///
|
||||
/// This vector gets drained during execution of the task (it will be empty afterwards).
|
||||
group: Vec<AuthorityDiscoveryId>,
|
||||
|
||||
/// The request to send.
|
||||
request: AvailabilityFetchingRequest,
|
||||
|
||||
/// Root hash, for verifying the chunk's validity.
|
||||
erasure_root: Hash,
|
||||
|
||||
/// Relay parent of the candidate to fetch.
|
||||
relay_parent: Hash,
|
||||
|
||||
/// Sender for communicating with other subsystems and reporting results.
|
||||
sender: mpsc::Sender<FromFetchTask>,
|
||||
|
||||
/// Prometheus metrics for reporting results.
|
||||
metrics: Metrics,
|
||||
}
|
||||
|
||||
impl FetchTaskConfig {
|
||||
/// Create a new configuration for a [`FetchTask`].
|
||||
///
|
||||
/// The result of this function can be passed into [`FetchTask::start`].
|
||||
pub fn new(
|
||||
leaf: Hash,
|
||||
core: &OccupiedCore,
|
||||
sender: mpsc::Sender<FromFetchTask>,
|
||||
metrics: Metrics,
|
||||
session_info: &SessionInfo,
|
||||
) -> Self {
|
||||
let live_in = vec![leaf].into_iter().collect();
|
||||
|
||||
// Don't run tasks for our backing group:
|
||||
if session_info.our_group == core.group_responsible {
|
||||
return FetchTaskConfig {
|
||||
live_in,
|
||||
prepared_running: None,
|
||||
};
|
||||
}
|
||||
|
||||
let prepared_running = RunningTask {
|
||||
session_index: session_info.session_index,
|
||||
group_index: core.group_responsible,
|
||||
group: session_info.validator_groups.get(core.group_responsible.0 as usize)
|
||||
.expect("The responsible group of a candidate should be available in the corresponding session. qed.")
|
||||
.clone(),
|
||||
request: AvailabilityFetchingRequest {
|
||||
candidate_hash: core.candidate_hash,
|
||||
index: session_info.our_index,
|
||||
},
|
||||
erasure_root: core.candidate_descriptor.erasure_root,
|
||||
relay_parent: core.candidate_descriptor.relay_parent,
|
||||
metrics,
|
||||
sender,
|
||||
};
|
||||
FetchTaskConfig {
|
||||
live_in,
|
||||
prepared_running: Some(prepared_running),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FetchTask {
|
||||
/// Start fetching a chunk.
|
||||
///
|
||||
/// A task handling the fetching of the configured chunk will be spawned.
|
||||
pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let FetchTaskConfig {
|
||||
prepared_running,
|
||||
live_in,
|
||||
} = config;
|
||||
|
||||
if let Some(running) = prepared_running {
|
||||
let (handle, kill) = oneshot::channel();
|
||||
|
||||
ctx.spawn("chunk-fetcher", running.run(kill).boxed())
|
||||
.await
|
||||
.map_err(|e| Error::SpawnTask(e))?;
|
||||
|
||||
Ok(FetchTask {
|
||||
live_in,
|
||||
state: FetchedState::Started(handle),
|
||||
})
|
||||
} else {
|
||||
Ok(FetchTask {
|
||||
live_in,
|
||||
state: FetchedState::Canceled,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Add the given leaf to the relay parents which are making this task relevant.
|
||||
///
|
||||
/// This is for book keeping, so we know we are already fetching a given chunk.
|
||||
pub fn add_leaf(&mut self, leaf: Hash) {
|
||||
self.live_in.insert(leaf);
|
||||
}
|
||||
|
||||
/// Remove leaves and cancel the task if the last relevant leaf was removed while the task was
/// still fetching.
|
||||
pub fn remove_leaves(&mut self, leaves: &HashSet<Hash>) {
|
||||
self.live_in.retain(|leaf| !leaves.contains(leaf));
|
||||
if self.live_in.is_empty() && !self.is_finished() {
|
||||
self.state = FetchedState::Canceled
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether or not there are still relay parents around with this candidate pending
|
||||
/// availability.
|
||||
pub fn is_live(&self) -> bool {
|
||||
!self.live_in.is_empty()
|
||||
}
|
||||
|
||||
/// Whether or not this task can be considered finished.
|
||||
///
|
||||
/// That is, it is either canceled, succeeded or failed.
|
||||
pub fn is_finished(&self) -> bool {
|
||||
match &self.state {
|
||||
FetchedState::Canceled => true,
|
||||
FetchedState::Started(sender) => sender.is_canceled(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Things that can go wrong in task execution.
|
||||
#[derive(Debug)]
|
||||
enum TaskError {
|
||||
/// The peer failed to deliver a correct chunk for some reason (has been reported as
|
||||
/// appropriate).
|
||||
PeerError,
|
||||
/// This very node is seemingly shutting down (sending of message failed).
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl RunningTask {
|
||||
async fn run(self, kill: oneshot::Receiver<()>) {
|
||||
// Wait for completion/or cancel.
|
||||
let run_it = self.run_inner();
|
||||
futures::pin_mut!(run_it);
|
||||
let _ = select(run_it, kill).await;
|
||||
}
|
||||
|
||||
/// Fetch and store chunk.
|
||||
///
|
||||
/// Try validators in backing group in order.
|
||||
async fn run_inner(mut self) {
|
||||
let mut bad_validators = Vec::new();
|
||||
let mut label = FAILED;
|
||||
let mut count: u32 = 0;
|
||||
// Try validators in reverse order:
|
||||
while let Some(validator) = self.group.pop() {
|
||||
// Report retries:
|
||||
if count > 0 {
|
||||
self.metrics.on_retry();
|
||||
}
|
||||
count += 1;
|
||||
|
||||
// Send request:
|
||||
let resp = match self.do_request(&validator).await {
|
||||
Ok(resp) => resp,
|
||||
Err(TaskError::ShuttingDown) => {
|
||||
tracing::info!(
|
||||
target: LOG_TARGET,
|
||||
"Node seems to be shutting down, canceling fetch task"
|
||||
);
|
||||
self.metrics.on_fetch(FAILED);
|
||||
return
|
||||
}
|
||||
Err(TaskError::PeerError) => {
|
||||
bad_validators.push(validator);
|
||||
continue
|
||||
}
|
||||
};
|
||||
let chunk = match resp {
|
||||
AvailabilityFetchingResponse::Chunk(resp) => {
|
||||
resp.recombine_into_chunk(&self.request)
|
||||
}
|
||||
AvailabilityFetchingResponse::NoSuchChunk => {
|
||||
tracing::debug!(
|
||||
target: LOG_TARGET,
|
||||
validator = ?validator,
|
||||
"Validator did not have our chunk"
|
||||
);
|
||||
bad_validators.push(validator);
|
||||
continue
|
||||
}
|
||||
};
|
||||
|
||||
// Data genuine?
|
||||
if !self.validate_chunk(&validator, &chunk) {
|
||||
bad_validators.push(validator);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, let's store it and be happy:
|
||||
self.store_chunk(chunk).await;
|
||||
label = SUCCEEDED;
|
||||
break;
|
||||
}
|
||||
self.metrics.on_fetch(label);
|
||||
self.conclude(bad_validators).await;
|
||||
}
|
||||
|
||||
/// Do request and return response, if successful.
|
||||
async fn do_request(
|
||||
&mut self,
|
||||
validator: &AuthorityDiscoveryId,
|
||||
) -> std::result::Result<AvailabilityFetchingResponse, TaskError> {
|
||||
let (full_request, response_recv) =
|
||||
OutgoingRequest::new(validator.clone(), self.request);
|
||||
let requests = Requests::AvailabilityFetching(full_request);
|
||||
|
||||
self.sender
|
||||
.send(FromFetchTask::Message(AllMessages::NetworkBridge(
|
||||
NetworkBridgeMessage::SendRequests(vec![requests]),
|
||||
)))
|
||||
.await
|
||||
.map_err(|_| TaskError::ShuttingDown)?;
|
||||
|
||||
match response_recv.await {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(RequestError::InvalidResponse(err)) => {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
origin= ?validator,
|
||||
err= ?err,
|
||||
"Peer sent us invalid erasure chunk data"
|
||||
);
|
||||
Err(TaskError::PeerError)
|
||||
}
|
||||
Err(RequestError::NetworkError(err)) => {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
origin= ?validator,
|
||||
err= ?err,
|
||||
"Some network error occurred when fetching erasure chunk"
|
||||
);
|
||||
Err(TaskError::PeerError)
|
||||
}
|
||||
Err(RequestError::Canceled(oneshot::Canceled)) => {
|
||||
tracing::warn!(target: LOG_TARGET,
|
||||
origin= ?validator,
|
||||
"Erasure chunk request got canceled");
|
||||
Err(TaskError::PeerError)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_chunk(&self, validator: &AuthorityDiscoveryId, chunk: &ErasureChunk) -> bool {
|
||||
let anticipated_hash =
|
||||
match branch_hash(&self.erasure_root, &chunk.proof, chunk.index.0 as usize) {
|
||||
Ok(hash) => hash,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
candidate_hash = ?self.request.candidate_hash,
|
||||
origin = ?validator,
|
||||
error = ?e,
|
||||
"Failed to calculate chunk merkle proof",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
let erasure_chunk_hash = BlakeTwo256::hash(&chunk.chunk);
|
||||
if anticipated_hash != erasure_chunk_hash {
|
||||
tracing::warn!(target: LOG_TARGET, origin = ?validator, "Received chunk does not match merkle tree");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Store given chunk and log any error.
|
||||
async fn store_chunk(&mut self, chunk: ErasureChunk) {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
let r = self
|
||||
.sender
|
||||
.send(FromFetchTask::Message(AllMessages::AvailabilityStore(
|
||||
AvailabilityStoreMessage::StoreChunk {
|
||||
candidate_hash: self.request.candidate_hash,
|
||||
relay_parent: self.relay_parent,
|
||||
chunk,
|
||||
tx,
|
||||
},
|
||||
)))
|
||||
.await;
|
||||
if let Err(err) = r {
|
||||
tracing::error!(target: LOG_TARGET, err= ?err, "Storing erasure chunk failed, system shutting down?");
|
||||
}
|
||||
|
||||
if let Err(oneshot::Canceled) = rx.await {
|
||||
tracing::error!(target: LOG_TARGET, "Storing erasure chunk failed");
|
||||
}
|
||||
}
|
||||
|
||||
/// Tell subsystem we are done.
|
||||
async fn conclude(&mut self, bad_validators: Vec<AuthorityDiscoveryId>) {
|
||||
let payload = if bad_validators.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(BadValidators {
|
||||
session_index: self.session_index,
|
||||
group_index: self.group_index,
|
||||
bad_validators,
|
||||
})
|
||||
};
|
||||
if let Err(err) = self.sender.send(FromFetchTask::Concluded(payload)).await {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
err= ?err,
|
||||
"Sending concluded message for task failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
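
Cancellation in `FetchTask` hinges on nothing more than dropping the `oneshot::Sender` stored in `FetchedState::Started`: `RunningTask::run` races the actual work against the `kill` receiver, and a dropped sender resolves that receiver immediately. A self-contained sketch of that select-against-kill pattern, with never-finishing stand-in work in place of the network request:

use futures::channel::oneshot;
use futures::future::{pending, select, Either};
use futures::pin_mut;

async fn run(kill: oneshot::Receiver<()>) -> &'static str {
    // Stand-in for `run_inner`: work that would otherwise never finish.
    let work = pending::<()>();
    pin_mut!(work);
    match select(work, kill).await {
        Either::Left(_) => "finished",
        // A dropped (or triggered) kill handle cancels the work.
        Either::Right(_) => "canceled",
    }
}

fn main() {
    let (handle, kill) = oneshot::channel::<()>();
    // Same effect as `FetchTask` dropping its `FetchedState::Started` sender:
    drop(handle);
    assert_eq!(futures::executor::block_on(run(kill)), "canceled");
}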
|
||||
@@ -0,0 +1,315 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use parity_scale_codec::Encode;
|
||||
|
||||
use futures::channel::{mpsc, oneshot};
|
||||
use futures::{executor, Future, FutureExt, StreamExt, select};
|
||||
use futures::task::{Poll, Context, noop_waker};
|
||||
|
||||
use polkadot_erasure_coding::{obtain_chunks_v1 as obtain_chunks, branches};
|
||||
use sc_network as network;
|
||||
use sp_keyring::Sr25519Keyring;
|
||||
|
||||
use polkadot_primitives::v1::{AvailableData, BlockData, CandidateHash, HeadData, PersistedValidationData, PoV, ValidatorIndex};
|
||||
use polkadot_node_network_protocol::request_response::v1;
|
||||
use polkadot_subsystem::messages::AllMessages;
|
||||
|
||||
use crate::metrics::Metrics;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn task_can_be_canceled() {
|
||||
let (task, _rx) = get_test_running_task();
|
||||
let (handle, kill) = oneshot::channel();
|
||||
std::mem::drop(handle);
|
||||
let running_task = task.run(kill);
|
||||
futures::pin_mut!(running_task);
|
||||
let waker = noop_waker();
|
||||
let mut ctx = Context::from_waker(&waker);
|
||||
assert!(running_task.poll(&mut ctx) == Poll::Ready(()), "Task is immediately finished");
|
||||
}
|
||||
|
||||
/// Make sure the task won't accept a chunk that is invalid.
|
||||
#[test]
|
||||
fn task_does_not_accept_invalid_chunk() {
|
||||
let (mut task, rx) = get_test_running_task();
|
||||
let validators = vec![Sr25519Keyring::Alice.public().into()];
|
||||
task.group = validators;
|
||||
let test = TestRun {
|
||||
chunk_responses: {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(
|
||||
Sr25519Keyring::Alice.public().into(),
|
||||
AvailabilityFetchingResponse::Chunk(
|
||||
v1::ChunkResponse {
|
||||
chunk: vec![1,2,3],
|
||||
proof: vec![vec![9,8,2], vec![2,3,4]],
|
||||
}
|
||||
)
|
||||
);
|
||||
m
|
||||
},
|
||||
valid_chunks: HashSet::new(),
|
||||
};
|
||||
test.run(task, rx);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn task_stores_valid_chunk() {
|
||||
let (mut task, rx) = get_test_running_task();
|
||||
let (root_hash, chunk) = get_valid_chunk_data();
|
||||
task.erasure_root = root_hash;
|
||||
task.request.index = chunk.index;
|
||||
|
||||
let validators = vec![Sr25519Keyring::Alice.public().into()];
|
||||
task.group = validators;
|
||||
|
||||
let test = TestRun {
|
||||
chunk_responses: {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(
|
||||
Sr25519Keyring::Alice.public().into(),
|
||||
AvailabilityFetchingResponse::Chunk(
|
||||
v1::ChunkResponse {
|
||||
chunk: chunk.chunk.clone(),
|
||||
proof: chunk.proof,
|
||||
}
|
||||
)
|
||||
);
|
||||
m
|
||||
},
|
||||
valid_chunks: {
|
||||
let mut s = HashSet::new();
|
||||
s.insert(chunk.chunk);
|
||||
s
|
||||
},
|
||||
};
|
||||
test.run(task, rx);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn task_does_not_accept_wrongly_indexed_chunk() {
|
||||
let (mut task, rx) = get_test_running_task();
|
||||
let (root_hash, chunk) = get_valid_chunk_data();
|
||||
task.erasure_root = root_hash;
|
||||
task.request.index = ValidatorIndex(chunk.index.0+1);
|
||||
|
||||
let validators = vec![Sr25519Keyring::Alice.public().into()];
|
||||
task.group = validators;
|
||||
|
||||
let test = TestRun {
|
||||
chunk_responses: {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(
|
||||
Sr25519Keyring::Alice.public().into(),
|
||||
AvailabilityFetchingResponse::Chunk(
|
||||
v1::ChunkResponse {
|
||||
chunk: chunk.chunk.clone(),
|
||||
proof: chunk.proof,
|
||||
}
|
||||
)
|
||||
);
|
||||
m
|
||||
},
|
||||
valid_chunks: HashSet::new(),
|
||||
};
|
||||
test.run(task, rx);
|
||||
}
|
||||
|
||||
/// Task stores chunk, if there is at least one validator having a valid chunk.
|
||||
#[test]
|
||||
fn task_stores_valid_chunk_if_there_is_one() {
|
||||
let (mut task, rx) = get_test_running_task();
|
||||
let (root_hash, chunk) = get_valid_chunk_data();
|
||||
task.erasure_root = root_hash;
|
||||
task.request.index = chunk.index;
|
||||
|
||||
let validators = [
|
||||
// Only Alice has valid chunk - should succeed, even though she is tried last.
|
||||
Sr25519Keyring::Alice,
|
||||
Sr25519Keyring::Bob, Sr25519Keyring::Charlie,
|
||||
Sr25519Keyring::Dave, Sr25519Keyring::Eve,
|
||||
]
|
||||
.iter().map(|v| v.public().into()).collect::<Vec<_>>();
|
||||
task.group = validators;
|
||||
|
||||
let test = TestRun {
|
||||
chunk_responses: {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(
|
||||
Sr25519Keyring::Alice.public().into(),
|
||||
AvailabilityFetchingResponse::Chunk(
|
||||
v1::ChunkResponse {
|
||||
chunk: chunk.chunk.clone(),
|
||||
proof: chunk.proof,
|
||||
}
|
||||
)
|
||||
);
|
||||
m.insert(
|
||||
Sr25519Keyring::Bob.public().into(),
|
||||
AvailabilityFetchingResponse::NoSuchChunk
|
||||
);
|
||||
m.insert(
|
||||
Sr25519Keyring::Charlie.public().into(),
|
||||
AvailabilityFetchingResponse::Chunk(
|
||||
v1::ChunkResponse {
|
||||
chunk: vec![1,2,3],
|
||||
proof: vec![vec![9,8,2], vec![2,3,4]],
|
||||
}
|
||||
)
|
||||
);
|
||||
|
||||
m
|
||||
},
|
||||
valid_chunks: {
|
||||
let mut s = HashSet::new();
|
||||
s.insert(chunk.chunk);
|
||||
s
|
||||
},
|
||||
};
|
||||
test.run(task, rx);
|
||||
}
|
||||
|
||||
struct TestRun {
|
||||
/// Response to deliver for a given validator.
/// A missing entry means: answer with a `NetworkError`.
|
||||
chunk_responses: HashMap<AuthorityDiscoveryId, AvailabilityFetchingResponse>,
|
||||
/// Set of chunks that should be considered valid:
|
||||
valid_chunks: HashSet<Vec<u8>>,
|
||||
}
|
||||
|
||||
|
||||
impl TestRun {
|
||||
fn run(self, task: RunningTask, rx: mpsc::Receiver<FromFetchTask>) {
|
||||
sp_tracing::try_init_simple();
|
||||
let mut rx = rx.fuse();
|
||||
let task = task.run_inner().fuse();
|
||||
futures::pin_mut!(task);
|
||||
executor::block_on(async {
|
||||
let mut end_ok = false;
|
||||
loop {
|
||||
let msg = select!(
|
||||
from_task = rx.next() => {
|
||||
match from_task {
|
||||
Some(msg) => msg,
|
||||
None => break,
|
||||
}
|
||||
},
|
||||
() = task =>
|
||||
break,
|
||||
);
|
||||
match msg {
|
||||
FromFetchTask::Concluded(_) => break,
|
||||
FromFetchTask::Message(msg) =>
|
||||
end_ok = self.handle_message(msg).await,
|
||||
}
|
||||
}
|
||||
if !end_ok {
|
||||
panic!("Task ended prematurely (failed to store valid chunk)!");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Returns true if, after processing the given message, it would be ok for the stream to
/// end.
|
||||
async fn handle_message(&self, msg: AllMessages) -> bool {
|
||||
match msg {
|
||||
AllMessages::NetworkBridge(NetworkBridgeMessage::SendRequests(reqs)) => {
|
||||
let mut valid_responses = 0;
|
||||
for req in reqs {
|
||||
let req = match req {
|
||||
Requests::AvailabilityFetching(req) => req,
|
||||
};
|
||||
let response = self.chunk_responses.get(&req.peer)
|
||||
.ok_or(network::RequestFailure::Refused);
|
||||
|
||||
if let Ok(AvailabilityFetchingResponse::Chunk(resp)) = &response {
|
||||
if self.valid_chunks.contains(&resp.chunk) {
|
||||
valid_responses += 1;
|
||||
}
|
||||
}
|
||||
req.pending_response.send(response.map(Encode::encode))
|
||||
.expect("Sending response should succeed");
|
||||
}
|
||||
return (valid_responses == 0) && self.valid_chunks.is_empty()
|
||||
}
|
||||
AllMessages::AvailabilityStore(
|
||||
AvailabilityStoreMessage::StoreChunk { chunk, tx, .. }
|
||||
) => {
|
||||
assert!(self.valid_chunks.contains(&chunk.chunk));
|
||||
tx.send(Ok(())).expect("Answering fetching task should work");
|
||||
return true
|
||||
}
|
||||
_ => {
|
||||
tracing::debug!(target: LOG_TARGET, "Unexpected message");
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a `RunningTask` filled with dummy values.
|
||||
fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
|
||||
let (tx,rx) = mpsc::channel(0);
|
||||
|
||||
(
|
||||
RunningTask {
|
||||
session_index: 0,
|
||||
group_index: GroupIndex(0),
|
||||
group: Vec::new(),
|
||||
request: AvailabilityFetchingRequest {
|
||||
candidate_hash: CandidateHash([43u8;32].into()),
|
||||
index: ValidatorIndex(0),
|
||||
},
|
||||
erasure_root: Hash::repeat_byte(99),
|
||||
relay_parent: Hash::repeat_byte(71),
|
||||
sender: tx,
|
||||
metrics: Metrics::new_dummy(),
|
||||
},
|
||||
rx
|
||||
)
|
||||
}
|
||||
|
||||
fn get_valid_chunk_data() -> (Hash, ErasureChunk) {
|
||||
let fake_validator_count = 10;
|
||||
let persisted = PersistedValidationData {
|
||||
parent_head: HeadData(vec![7, 8, 9]),
|
||||
relay_parent_number: Default::default(),
|
||||
max_pov_size: 1024,
|
||||
relay_parent_storage_root: Default::default(),
|
||||
};
|
||||
let pov_block = PoV {
|
||||
block_data: BlockData(vec![45, 46, 47]),
|
||||
};
|
||||
let available_data = AvailableData {
|
||||
validation_data: persisted, pov: Arc::new(pov_block),
|
||||
};
|
||||
let chunks = obtain_chunks(fake_validator_count, &available_data).unwrap();
|
||||
let branches = branches(chunks.as_ref());
|
||||
let root = branches.root();
|
||||
let chunk = branches.enumerate()
|
||||
.map(|(index, (proof, chunk))| ErasureChunk {
|
||||
chunk: chunk.to_vec(),
|
||||
index: ValidatorIndex(index as _),
|
||||
proof,
|
||||
})
|
||||
.next().expect("There really should be 10 chunks.");
|
||||
(root, chunk)
|
||||
}
|
||||
@@ -0,0 +1,236 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Requester takes care of requesting erasure chunks for candidates that are pending
|
||||
//! availability.
|
||||
|
||||
use std::collections::{
|
||||
hash_map::{Entry, HashMap},
|
||||
hash_set::HashSet,
|
||||
};
|
||||
use std::iter::IntoIterator;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::{
|
||||
channel::mpsc,
|
||||
task::{Context, Poll},
|
||||
Stream,
|
||||
};
|
||||
|
||||
use sp_keystore::SyncCryptoStorePtr;
|
||||
|
||||
use polkadot_node_subsystem_util::request_availability_cores_ctx;
|
||||
use polkadot_primitives::v1::{CandidateHash, CoreState, Hash, OccupiedCore};
|
||||
use polkadot_subsystem::{
|
||||
messages::AllMessages, ActiveLeavesUpdate, jaeger, SubsystemContext,
|
||||
};
|
||||
|
||||
use super::{error::recv_runtime, session_cache::SessionCache, Result, LOG_TARGET, Metrics};
|
||||
|
||||
/// A task fetching a particular chunk.
|
||||
mod fetch_task;
|
||||
use fetch_task::{FetchTask, FetchTaskConfig, FromFetchTask};
|
||||
|
||||
/// Requester takes care of requesting erasure chunks from backing groups and stores them in the
|
||||
/// av store.
|
||||
///
|
||||
/// It implements a stream that needs to be advanced for it to make progress.
|
||||
pub struct Requester {
|
||||
/// Candidates we need to fetch our chunk for.
|
||||
///
|
||||
/// We keep those around as long as a candidate is pending availability on some leaf, so we
|
||||
/// won't fetch chunks multiple times.
|
||||
fetches: HashMap<CandidateHash, FetchTask>,
|
||||
|
||||
/// Localized information about sessions we are currently interested in.
|
||||
session_cache: SessionCache,
|
||||
|
||||
/// Sender to be cloned for `FetchTask`s.
|
||||
tx: mpsc::Sender<FromFetchTask>,
|
||||
|
||||
/// Receive messages from `FetchTask`.
|
||||
rx: mpsc::Receiver<FromFetchTask>,
|
||||
|
||||
/// Prometheus Metrics
|
||||
metrics: Metrics,
|
||||
}
|
||||
|
||||
impl Requester {
|
||||
/// Create a new `Requester`.
|
||||
///
|
||||
/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
|
||||
/// by advancing the stream.
|
||||
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
|
||||
// All we do is forwarding messages, no need to make this big.
|
||||
// Each sender will get one slot, see
|
||||
// [here](https://docs.rs/futures/0.3.13/futures/channel/mpsc/fn.channel.html).
|
||||
let (tx, rx) = mpsc::channel(0);
|
||||
Requester {
|
||||
fetches: HashMap::new(),
|
||||
session_cache: SessionCache::new(keystore),
|
||||
tx,
|
||||
rx,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
/// Update heads that need availability distribution.
|
||||
///
|
||||
/// For all active heads we will be fetching our chunks for availability distribution.
|
||||
pub async fn update_fetching_heads<Context>(
|
||||
&mut self,
|
||||
ctx: &mut Context,
|
||||
update: ActiveLeavesUpdate,
|
||||
) -> Result<()>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let ActiveLeavesUpdate {
|
||||
activated,
|
||||
deactivated,
|
||||
} = update;
|
||||
// Order important! We need to handle activated, prior to deactivated, otherwise we might
|
||||
// cancel still needed jobs.
|
||||
self.start_requesting_chunks(ctx, activated.into_iter())
|
||||
.await?;
|
||||
self.stop_requesting_chunks(deactivated.into_iter());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start requesting chunks for newly imported heads.
|
||||
async fn start_requesting_chunks<Context>(
|
||||
&mut self,
|
||||
ctx: &mut Context,
|
||||
new_heads: impl Iterator<Item = (Hash, Arc<jaeger::Span>)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
for (leaf, _) in new_heads {
|
||||
let cores = query_occupied_cores(ctx, leaf).await?;
|
||||
self.add_cores(ctx, leaf, cores).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop requesting chunks for obsolete heads.
|
||||
fn stop_requesting_chunks(&mut self, obsolete_leaves: impl Iterator<Item = Hash>) {
|
||||
let obsolete_leaves: HashSet<_> = obsolete_leaves.collect();
|
||||
self.fetches.retain(|_, task| {
|
||||
task.remove_leaves(&obsolete_leaves);
|
||||
task.is_live()
|
||||
})
|
||||
}
|
||||
|
||||
/// Add candidates corresponding to a particular relay parent.
///
/// Starts requests where necessary.
|
||||
///
|
||||
/// Note: The passed in `leaf` is not the same as CandidateDescriptor::relay_parent in the
|
||||
/// given cores. The latter is the relay_parent this candidate considers its parent, while the
|
||||
/// passed in leaf might be some later block where the candidate is still pending availability.
|
||||
async fn add_cores<Context>(
|
||||
&mut self,
|
||||
ctx: &mut Context,
|
||||
leaf: Hash,
|
||||
cores: impl IntoIterator<Item = OccupiedCore>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
for core in cores {
|
||||
match self.fetches.entry(core.candidate_hash) {
|
||||
Entry::Occupied(mut e) =>
|
||||
// Just book keeping - we are already requesting that chunk:
|
||||
{
|
||||
e.get_mut().add_leaf(leaf);
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let tx = self.tx.clone();
|
||||
let metrics = self.metrics.clone();
|
||||
|
||||
let task_cfg = self
|
||||
.session_cache
|
||||
.with_session_info(
|
||||
ctx,
|
||||
// We use leaf here, as relay_parent must be in the same session as the
|
||||
// leaf. (Cores are dropped at session boundaries.) At the same time,
|
||||
// only leaves are guaranteed to be fetchable by the state trie.
|
||||
leaf,
|
||||
|info| FetchTaskConfig::new(leaf, &core, tx, metrics, info),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(task_cfg) = task_cfg {
|
||||
e.insert(FetchTask::start(task_cfg, ctx).await?);
|
||||
}
|
||||
// Not a validator, nothing to do.
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for Requester {
|
||||
type Item = AllMessages;
|
||||
|
||||
fn poll_next(
|
||||
mut self: Pin<&mut Self>,
|
||||
ctx: &mut Context,
|
||||
) -> Poll<Option<AllMessages>> {
|
||||
loop {
|
||||
match Pin::new(&mut self.rx).poll_next(ctx) {
|
||||
Poll::Ready(Some(FromFetchTask::Message(m))) =>
|
||||
return Poll::Ready(Some(m)),
|
||||
Poll::Ready(Some(FromFetchTask::Concluded(Some(bad_boys)))) => {
|
||||
self.session_cache.report_bad_log(bad_boys);
|
||||
continue
|
||||
}
|
||||
Poll::Ready(Some(FromFetchTask::Concluded(None))) =>
|
||||
continue,
|
||||
Poll::Ready(None) =>
|
||||
return Poll::Ready(None),
|
||||
Poll::Pending =>
|
||||
return Poll::Pending,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Query all hashes and descriptors of candidates pending availability at a particular block.
|
||||
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
|
||||
async fn query_occupied_cores<Context>(
|
||||
ctx: &mut Context,
|
||||
relay_parent: Hash,
|
||||
) -> Result<Vec<OccupiedCore>>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let cores = recv_runtime(request_availability_cores_ctx(relay_parent, ctx).await).await?;
|
||||
|
||||
Ok(cores
|
||||
.into_iter()
|
||||
.filter_map(|core_state| {
|
||||
if let CoreState::Occupied(occupied) = core_state {
|
||||
Some(occupied)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
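
The `Stream` implementation above multiplexes the internal task channel: `Concluded` reports are consumed on the spot (feeding `report_bad_log`), while `Message`s bubble up to whoever drives the `Requester`. A stand-alone sketch of that filtering-stream pattern with simplified item types (the real ones being `AllMessages` and `BadValidators`):

use std::pin::Pin;
use futures::channel::mpsc;
use futures::task::{Context, Poll};
use futures::{SinkExt, Stream, StreamExt};

enum FromTask {
    Message(String),        // forwarded to the caller
    Concluded(Option<u32>), // handled internally (e.g. report bad validators)
}

struct Requester {
    rx: mpsc::Receiver<FromTask>,
    reports: Vec<u32>,
}

impl Stream for Requester {
    type Item = String;
    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<String>> {
        loop {
            match Pin::new(&mut self.rx).poll_next(cx) {
                Poll::Ready(Some(FromTask::Message(m))) => return Poll::Ready(Some(m)),
                // Consume conclusions without yielding them.
                Poll::Ready(Some(FromTask::Concluded(Some(bad)))) => self.reports.push(bad),
                Poll::Ready(Some(FromTask::Concluded(None))) => continue,
                Poll::Ready(None) => return Poll::Ready(None),
                Poll::Pending => return Poll::Pending,
            }
        }
    }
}

fn main() {
    futures::executor::block_on(async {
        let (mut tx, rx) = mpsc::channel(4);
        tx.send(FromTask::Concluded(Some(7))).await.unwrap();
        tx.send(FromTask::Message("store chunk".into())).await.unwrap();
        drop(tx);
        let mut requester = Requester { rx, reports: Vec::new() };
        assert_eq!(requester.next().await, Some("store chunk".into()));
        assert_eq!(requester.next().await, None);
        assert_eq!(requester.reports, vec![7]);
    });
}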
|
||||
@@ -0,0 +1,97 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Answer requests for availability chunks.
|
||||
|
||||
use futures::channel::oneshot;
|
||||
|
||||
use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1};
|
||||
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
|
||||
use polkadot_subsystem::{
|
||||
messages::{AllMessages, AvailabilityStoreMessage},
|
||||
SubsystemContext,
|
||||
};
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}};
|
||||
|
||||
/// Variant of `answer_request` that updates Prometheus metrics and logs errors.
|
||||
///
|
||||
/// Any errors of `answer_request` will simply be logged.
|
||||
pub async fn answer_request_log<Context>(
|
||||
ctx: &mut Context,
|
||||
req: IncomingRequest<v1::AvailabilityFetchingRequest>,
|
||||
metrics: &Metrics,
|
||||
) -> ()
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let res = answer_request(ctx, req).await;
|
||||
match res {
|
||||
Ok(result) =>
|
||||
metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}),
|
||||
Err(err) => {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
err= ?err,
|
||||
"Serving chunk failed with error"
|
||||
);
|
||||
metrics.on_served(FAILED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Answer an incoming chunk request by querying the av store.
|
||||
///
|
||||
/// Returns: Ok(true) if chunk was found and served.
|
||||
pub async fn answer_request<Context>(
|
||||
ctx: &mut Context,
|
||||
req: IncomingRequest<v1::AvailabilityFetchingRequest>,
|
||||
) -> Result<bool>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?;
|
||||
|
||||
let result = chunk.is_some();
|
||||
|
||||
let response = match chunk {
|
||||
None => v1::AvailabilityFetchingResponse::NoSuchChunk,
|
||||
Some(chunk) => v1::AvailabilityFetchingResponse::Chunk(chunk.into()),
|
||||
};
|
||||
|
||||
req.send_response(response).map_err(|_| Error::SendResponse)?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Query chunk from the availability store.
|
||||
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
|
||||
async fn query_chunk<Context>(
|
||||
ctx: &mut Context,
|
||||
candidate_hash: CandidateHash,
|
||||
validator_index: ValidatorIndex,
|
||||
) -> Result<Option<ErasureChunk>>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let (tx, rx) = oneshot::channel();
|
||||
ctx.send_message(AllMessages::AvailabilityStore(
|
||||
AvailabilityStoreMessage::QueryChunk(candidate_hash, validator_index, tx),
|
||||
))
|
||||
.await;
|
||||
|
||||
rx.await.map_err(|e| Error::QueryChunkResponseChannel(e))
|
||||
}
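
Stripped of the subsystem plumbing, the responder is a lookup plus a reply channel: `query_chunk` asks the availability store over a oneshot, and `answer_request` maps a missing chunk to `NoSuchChunk` while reporting whether anything was served. A toy version of that flow, with an in-memory map standing in for the availability store:

use std::collections::HashMap;
use futures::channel::oneshot;

#[derive(Debug, PartialEq)]
enum Response {
    Chunk(Vec<u8>),
    NoSuchChunk,
}

// Stand-in for `answer_request`: look the chunk up, reply on the request's
// response channel, and report back whether it was found.
async fn answer_request(
    store: &HashMap<u32, Vec<u8>>,
    candidate: u32,
    pending_response: oneshot::Sender<Response>,
) -> bool {
    let chunk = store.get(&candidate).cloned();
    let found = chunk.is_some();
    let response = match chunk {
        Some(c) => Response::Chunk(c),
        None => Response::NoSuchChunk,
    };
    let _ = pending_response.send(response);
    found
}

fn main() {
    futures::executor::block_on(async {
        let mut store = HashMap::new();
        store.insert(1u32, vec![1, 2, 3]);

        let (tx, rx) = oneshot::channel();
        assert!(answer_request(&store, 1, tx).await);
        assert_eq!(rx.await.unwrap(), Response::Chunk(vec![1, 2, 3]));

        let (tx, rx) = oneshot::channel();
        assert!(!answer_request(&store, 2, tx).await);
        assert_eq!(rx.await.unwrap(), Response::NoSuchChunk);
    });
}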
|
||||
@@ -0,0 +1,275 @@
|
||||
// Copyright 2021 Parity Technologies (UK) Ltd.
|
||||
// This file is part of Polkadot.
|
||||
|
||||
// Polkadot is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
|
||||
// Polkadot is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use lru::LruCache;
|
||||
use rand::{seq::SliceRandom, thread_rng};
|
||||
|
||||
use sp_application_crypto::AppKey;
|
||||
use sp_core::crypto::Public;
|
||||
use sp_keystore::{CryptoStore, SyncCryptoStorePtr};
|
||||
|
||||
use polkadot_node_subsystem_util::{
|
||||
request_session_index_for_child_ctx, request_session_info_ctx,
|
||||
};
|
||||
use polkadot_primitives::v1::SessionInfo as GlobalSessionInfo;
|
||||
use polkadot_primitives::v1::{
|
||||
AuthorityDiscoveryId, GroupIndex, Hash, SessionIndex, ValidatorId, ValidatorIndex,
|
||||
};
|
||||
use polkadot_subsystem::SubsystemContext;
|
||||
|
||||
use super::{
|
||||
error::{recv_runtime, Result},
|
||||
Error,
|
||||
LOG_TARGET,
|
||||
};
|
||||
|
||||
/// Caching of session info as needed by availability distribution.
|
||||
///
|
||||
/// It should be ensured that a cached session stays live in the cache as long as we might need it.
|
||||
pub struct SessionCache {
|
||||
/// Get the session index for a given relay parent.
|
||||
///
|
||||
/// We query this up to 100 times per block, so caching it here without roundtrips over the
|
||||
/// overseer seems sensible.
|
||||
session_index_cache: LruCache<Hash, SessionIndex>,
|
||||
|
||||
/// Look up cached sessions by SessionIndex.
|
||||
///
|
||||
/// Note: Performance of fetching is really secondary here, but we need to make sure we use any
/// existing cache entry before fetching new information, as we must not mess up the order of
/// validators in `SessionInfo::validator_groups`. (We want live TCP connections wherever
/// possible.)
|
||||
session_info_cache: LruCache<SessionIndex, SessionInfo>,
|
||||
|
||||
/// Key store for determining whether we are a validator and what `ValidatorIndex` we have.
|
||||
keystore: SyncCryptoStorePtr,
|
||||
}
|
||||
|
||||
/// Localized session information, tailored for the needs of availability distribution.
|
||||
#[derive(Clone)]
|
||||
pub struct SessionInfo {
|
||||
/// The index of this session.
|
||||
pub session_index: SessionIndex,
|
||||
|
||||
/// Validator groups of the current session.
|
||||
///
|
||||
/// Each group's order is randomized. This way we achieve load balancing when requesting
|
||||
/// chunks, as the validators in a group will be tried in that randomized order. Each node
|
||||
/// should arrive at a different order, therefore we distribute the load on individual
|
||||
/// validators.
|
||||
pub validator_groups: Vec<Vec<AuthorityDiscoveryId>>,
|
||||
|
||||
/// Information about ourselves:
|
||||
pub our_index: ValidatorIndex,
|
||||
|
||||
/// Remember to which group we belong, so we won't start fetching chunks for candidates with
|
||||
/// our group being responsible. (We should have that chunk already.)
|
||||
pub our_group: GroupIndex,
|
||||
}
|
||||
|
||||
/// Report of bad validators.
|
||||
///
|
||||
/// Fetching tasks will report back validators that did not respond as expected, so we can re-order
|
||||
/// them.
|
||||
pub struct BadValidators {
|
||||
/// The session index that was used.
|
||||
pub session_index: SessionIndex,
|
||||
/// The group the misbehaving validators belong to.
|
||||
pub group_index: GroupIndex,
|
||||
/// The list of bad validators.
|
||||
pub bad_validators: Vec<AuthorityDiscoveryId>,
|
||||
}
|
||||
|
||||
impl SessionCache {
|
||||
/// Create a new `SessionCache`.
|
||||
pub fn new(keystore: SyncCryptoStorePtr) -> Self {
|
||||
SessionCache {
|
||||
// 5 is relatively conservative, 1 or 2 should suffice:
|
||||
session_index_cache: LruCache::new(5),
|
||||
// We need to cache the current and the last session the most:
|
||||
session_info_cache: LruCache::new(2),
|
||||
keystore,
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to retrieve `SessionInfo` and calls `with_info` if successful.
|
||||
///
|
||||
/// If this node is not a validator, the function will return `None`.
|
||||
///
|
||||
/// Use this function over any `fetch_session_info` if all you need is a reference to
|
||||
/// `SessionInfo`, as it avoids an expensive clone.
|
||||
pub async fn with_session_info<Context, F, R>(
|
||||
&mut self,
|
||||
ctx: &mut Context,
|
||||
parent: Hash,
|
||||
with_info: F,
|
||||
) -> Result<Option<R>>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
F: FnOnce(&SessionInfo) -> R,
|
||||
{
|
||||
let session_index = match self.session_index_cache.get(&parent) {
|
||||
Some(index) => *index,
|
||||
None => {
|
||||
let index =
|
||||
recv_runtime(request_session_index_for_child_ctx(parent, ctx).await)
|
||||
.await?;
|
||||
self.session_index_cache.put(parent, index);
|
||||
index
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(info) = self.session_info_cache.get(&session_index) {
|
||||
return Ok(Some(with_info(info)));
|
||||
}
|
||||
|
||||
if let Some(info) = self
|
||||
.query_info_from_runtime(ctx, parent, session_index)
|
||||
.await?
|
||||
{
|
||||
let r = with_info(&info);
|
||||
self.session_info_cache.put(session_index, info);
|
||||
return Ok(Some(r));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Variant of `report_bad` that never fails, but just logs errors.
|
||||
///
|
||||
/// Not being able to report bad validators is not fatal, so we should not shutdown the
|
||||
/// subsystem on this.
|
||||
pub fn report_bad_log(&mut self, report: BadValidators) {
|
||||
if let Err(err) = self.report_bad(report) {
|
||||
tracing::warn!(
|
||||
target: LOG_TARGET,
|
||||
err= ?err,
|
||||
"Reporting bad validators failed with error"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Make sure we try unresponsive or misbehaving validators last.
|
||||
///
|
||||
/// We assume validators in a group are tried in reverse order, so the reported bad validators
|
||||
/// will be put at the beginning of the group.
|
||||
#[tracing::instrument(level = "trace", skip(self, report), fields(subsystem = LOG_TARGET))]
|
||||
pub fn report_bad(&mut self, report: BadValidators) -> Result<()> {
|
||||
let session = self
|
||||
.session_info_cache
|
||||
.get_mut(&report.session_index)
|
||||
.ok_or(Error::NoSuchCachedSession)?;
|
||||
let group = session
|
||||
.validator_groups
|
||||
.get_mut(report.group_index.0 as usize)
|
||||
.expect("A bad validator report must contain a valid group for the reported session. qed.");
|
||||
let bad_set = report.bad_validators.iter().collect::<HashSet<_>>();
|
||||
|
||||
// Get rid of bad boys:
|
||||
group.retain(|v| !bad_set.contains(v));
|
||||
|
||||
// We are trying validators in reverse order, so bad ones should be first:
|
||||
let mut new_group = report.bad_validators;
|
||||
new_group.append(group);
|
||||
*group = new_group;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Query needed information from runtime.
|
||||
///
|
||||
/// We need to pass in the relay parent for our call to `request_session_info_ctx`, although we
/// should not actually need it: I suppose it is used for internal caching based on relay
/// parents, which we don't use here. It should not do any harm though.
|
||||
async fn query_info_from_runtime<Context>(
|
||||
&self,
|
||||
ctx: &mut Context,
|
||||
parent: Hash,
|
||||
session_index: SessionIndex,
|
||||
) -> Result<Option<SessionInfo>>
|
||||
where
|
||||
Context: SubsystemContext,
|
||||
{
|
||||
let GlobalSessionInfo {
|
||||
validators,
|
||||
discovery_keys,
|
||||
mut validator_groups,
|
||||
..
|
||||
} = recv_runtime(request_session_info_ctx(parent, session_index, ctx).await)
|
||||
.await?
|
||||
.ok_or(Error::NoSuchSession(session_index))?;
|
||||
|
||||
if let Some(our_index) = self.get_our_index(validators).await {
|
||||
// Get our group index:
|
||||
let our_group = validator_groups
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find_map(|(i, g)| {
|
||||
g.iter().find_map(|v| {
|
||||
if *v == our_index {
|
||||
Some(GroupIndex(i as u32))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
})
|
||||
.expect("Every validator should be in a validator group. qed.");
|
||||
|
||||
// Shuffle validators in groups:
|
||||
let mut rng = thread_rng();
|
||||
for g in validator_groups.iter_mut() {
|
||||
g.shuffle(&mut rng)
|
||||
}
|
||||
// Look up `AuthorityDiscoveryId`s right away:
|
||||
let validator_groups: Vec<Vec<_>> = validator_groups
|
||||
.into_iter()
|
||||
.map(|group| {
|
||||
group
|
||||
.into_iter()
|
||||
.map(|index| {
|
||||
discovery_keys.get(index.0 as usize)
|
||||
.expect("There should be a discovery key for each validator of each validator group. qed.")
|
||||
.clone()
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let info = SessionInfo {
|
||||
validator_groups,
|
||||
our_index,
|
||||
session_index,
|
||||
our_group,
|
||||
};
|
||||
return Ok(Some(info));
|
||||
}
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
/// Get our `ValidatorIndex`.
|
||||
///
|
||||
/// Returns: None if we are not a validator.
|
||||
async fn get_our_index(&self, validators: Vec<ValidatorId>) -> Option<ValidatorIndex> {
|
||||
for (i, v) in validators.iter().enumerate() {
|
||||
if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)])
|
||||
.await
|
||||
{
|
||||
return Some(ValidatorIndex(i as u32));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
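
`report_bad` only makes sense together with the way `RunningTask` drains its group from the back via `pop()`: by stripping the reported validators out and prepending them, they become the last resort on the next fetch for that group. A stand-alone sketch of that reordering, with plain strings in place of `AuthorityDiscoveryId`s:

use std::collections::HashSet;

// Stand-in for `report_bad`: move misbehaving validators to the front so a
// task draining the group with `pop()` tries them last.
fn report_bad(group: &mut Vec<&'static str>, bad: Vec<&'static str>) {
    let bad_set: HashSet<_> = bad.iter().copied().collect();
    group.retain(|v| !bad_set.contains(v));
    let mut new_group = bad;
    new_group.append(group);
    *group = new_group;
}

fn main() {
    let mut group = vec!["alice", "bob", "charlie"];
    report_bad(&mut group, vec!["charlie"]);
    assert_eq!(group, vec!["charlie", "alice", "bob"]);

    // A task pops from the back, so well-behaved validators are tried first:
    assert_eq!(group.pop(), Some("bob"));
}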
|
||||