Request based PoV distribution (#2640)

* Indentation fix.

* Prepare request-response for PoV fetching.

* Drop old PoV distribution.

* WIP: Fetch PoV directly from backing.

* Backing compiles.

* Runtime access and connection management for PoV distribution.

* Get rid of seemingly dead code.

* Implement PoV fetching.

Backing does not yet use it.

* Don't send `ConnectToValidators` for empty list.

* Even better - no need to check over and over again.

* PoV fetching implemented.

+ Typechecks
+ Should work

Missing:

- Guide
- Tests
- Do fallback fetching in case fetching from seconding validator fails.

* Check PoV hash upon reception.

* Implement retry of PoV fetching in backing.

* Avoid pointless validation spawning.

* Add jaeger span to pov requesting.

* Add back tracing.

* Review remarks.

* Whitespace.

* Whitespace again.

* Cleanup + fix tests.

* Log to log target in overseer.

* Fix more tests.

* Don't fail if group cannot be found.

* Simple test for PoV fetcher.

* Handle missing group membership better.

* Add test for retry functionality.

* Fix flaky test.

* Spaces again.

* Guide updates.

* Spaces.
This commit is contained in:
Robert Klotzner
2021-03-28 17:11:38 +02:00
committed by GitHub
parent 27b6d83974
commit c6f07d8f31
35 changed files with 1382 additions and 3184 deletions
@@ -17,20 +17,26 @@
//! Error handling related code and Error/Result definitions.
use polkadot_node_network_protocol::request_response::request::RequestError;
use thiserror::Error;
use futures::channel::oneshot;
use polkadot_node_subsystem_util::Error as UtilError;
use polkadot_primitives::v1::SessionIndex;
use polkadot_primitives::v1::{CompressedPoVError, SessionIndex};
use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError};
use crate::LOG_TARGET;
/// Errors of this subsystem.
#[derive(Debug, Error)]
pub enum Error {
#[error("Response channel to obtain QueryChunk failed")]
#[error("Response channel to obtain chunk failed")]
QueryChunkResponseChannel(#[source] oneshot::Canceled),
#[error("Response channel to obtain available data failed")]
QueryAvailableDataResponseChannel(#[source] oneshot::Canceled),
#[error("Receive channel closed")]
IncomingMessageChannel(#[source] SubsystemError),
@@ -53,24 +59,43 @@ pub enum Error {
/// Sending response failed.
#[error("Sending a request's response failed.")]
SendResponse,
}
/// Error that we should handle gracefully by logging it.
#[derive(Debug)]
pub enum NonFatalError {
/// Some request to utility functions failed.
/// This can be either `RuntimeRequestCanceled` or `RuntimeApiError`.
#[error("Utility request failed")]
UtilRequest(UtilError),
/// Runtime API subsystem is down, which means we're shutting down.
#[error("Runtime request canceled")]
RuntimeRequestCanceled(oneshot::Canceled),
/// Some request to the runtime failed.
/// For example if we prune a block we're requesting info about.
#[error("Runtime API error")]
RuntimeRequest(RuntimeApiError),
/// We tried fetching a session info which was not available.
#[error("There was no session with the given index")]
NoSuchSession(SessionIndex),
/// Decompressing PoV failed.
#[error("PoV could not be decompressed")]
PoVDecompression(CompressedPoVError),
/// Fetching PoV failed with `RequestError`.
#[error("FetchPoV request error")]
FetchPoV(#[source] RequestError),
/// Fetching PoV failed as the received PoV did not match the expected hash.
#[error("Fetched PoV does not match expected hash")]
UnexpectedPoV,
#[error("Remote responded with `NoSuchPoV`")]
NoSuchPoV,
/// No validator with the index could be found in current session.
#[error("Given validator index could not be found")]
InvalidValidatorIndex,
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -87,9 +112,20 @@ pub(crate) async fn recv_runtime<V>(
oneshot::Receiver<std::result::Result<V, RuntimeApiError>>,
UtilError,
>,
) -> std::result::Result<V, NonFatalError> {
r.map_err(NonFatalError::UtilRequest)?
) -> std::result::Result<V, Error> {
r.map_err(Error::UtilRequest)?
.await
.map_err(NonFatalError::RuntimeRequestCanceled)?
.map_err(NonFatalError::RuntimeRequest)
.map_err(Error::RuntimeRequestCanceled)?
.map_err(Error::RuntimeRequest)
}
/// Utility for eating top level errors and log them.
///
/// We basically always want to try and continue on error. This utility function is meant to
/// consume top-level errors by simply logging them
pub fn log_error(result: Result<()>, ctx: &'static str) {
if let Err(error) = result {
tracing::warn!(target: LOG_TARGET, error = ?error, ctx);
}
}
@@ -26,15 +26,23 @@ use polkadot_subsystem::{
/// Error and [`Result`] type for this subsystem.
mod error;
pub use error::Error;
use error::Result;
use error::{Result, log_error};
/// Runtime requests.
mod runtime;
use runtime::Runtime;
/// `Requester` taking care of requesting chunks for candidates pending availability.
mod requester;
use requester::Requester;
/// Handing requests for PoVs during backing.
mod pov_requester;
use pov_requester::PoVRequester;
/// Responding to erasure chunk requests:
mod responder;
use responder::answer_request_log;
use responder::{answer_chunk_request_log, answer_pov_request_log};
/// Cache for session information.
mod session_cache;
@@ -52,6 +60,8 @@ const LOG_TARGET: &'static str = "parachain::availability-distribution";
pub struct AvailabilityDistributionSubsystem {
/// Pointer to a keystore, which is required for determining this nodes validator index.
keystore: SyncCryptoStorePtr,
/// Easy and efficient runtime access for this subsystem.
runtime: Runtime,
/// Prometheus metrics.
metrics: Metrics,
}
@@ -74,17 +84,20 @@ where
}
impl AvailabilityDistributionSubsystem {
/// Create a new instance of the availability distribution.
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
Self { keystore, metrics }
let runtime = Runtime::new(keystore.clone());
Self { keystore, runtime, metrics }
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> Result<()>
async fn run<Context>(mut self, mut ctx: Context) -> Result<()>
where
Context: SubsystemContext<Message = AvailabilityDistributionMessage> + Sync + Send,
{
let mut requester = Requester::new(self.keystore.clone(), self.metrics.clone()).fuse();
let mut pov_requester = PoVRequester::new();
loop {
let action = {
let mut subsystem_next = ctx.recv().fuse();
@@ -107,14 +120,14 @@ impl AvailabilityDistributionSubsystem {
};
match message {
FromOverseer::Signal(OverseerSignal::ActiveLeaves(update)) => {
// Update the relay chain heads we are fetching our pieces for:
if let Some(e) = requester
.get_mut()
.update_fetching_heads(&mut ctx, update)
.await?
{
tracing::debug!(target: LOG_TARGET, "Error processing ActiveLeavesUpdate: {:?}", e);
}
log_error(
pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await,
"PoVRequester::update_connected_validators"
);
log_error(
requester.get_mut().update_fetching_heads(&mut ctx, update).await,
"Error in Requester::update_fetching_heads"
);
}
FromOverseer::Signal(OverseerSignal::BlockFinalized(..)) => {}
FromOverseer::Signal(OverseerSignal::Conclude) => {
@@ -123,7 +136,34 @@ impl AvailabilityDistributionSubsystem {
FromOverseer::Communication {
msg: AvailabilityDistributionMessage::ChunkFetchingRequest(req),
} => {
answer_request_log(&mut ctx, req, &self.metrics).await
answer_chunk_request_log(&mut ctx, req, &self.metrics).await
}
FromOverseer::Communication {
msg: AvailabilityDistributionMessage::PoVFetchingRequest(req),
} => {
answer_pov_request_log(&mut ctx, req, &self.metrics).await
}
FromOverseer::Communication {
msg: AvailabilityDistributionMessage::FetchPoV {
relay_parent,
from_validator,
candidate_hash,
pov_hash,
tx,
},
} => {
log_error(
pov_requester.fetch_pov(
&mut ctx,
&mut self.runtime,
relay_parent,
from_validator,
candidate_hash,
pov_hash,
tx,
).await,
"PoVRequester::fetch_pov"
);
}
}
}
@@ -24,7 +24,7 @@ pub const SUCCEEDED: &'static str = "succeeded";
/// Label for fail counters.
pub const FAILED: &'static str = "failed";
/// Label for chunks that could not be served, because they were not available.
/// Label for chunks/PoVs that could not be served, because they were not available.
pub const NOT_FOUND: &'static str = "not-found";
/// Availability Distribution metrics.
@@ -47,6 +47,12 @@ struct MetricsInner {
/// to a chunk request. This includes `NoSuchChunk` responses.
served_chunks: CounterVec<U64>,
/// Number of PoVs served.
///
/// Note: Right now, `Succeeded` gets incremented whenever we were able to successfully respond
/// to a PoV request. This includes `NoSuchPoV` responses.
served_povs: CounterVec<U64>,
/// Number of times our first set of validators did not provide the needed chunk and we had to
/// query further validators.
retries: Counter<U64>,
@@ -66,12 +72,19 @@ impl Metrics {
}
/// Increment counter on served chunks.
pub fn on_served(&self, label: &'static str) {
pub fn on_served_chunk(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.served_chunks.with_label_values(&[label]).inc()
}
}
/// Increment counter on served PoVs.
pub fn on_served_pov(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.served_povs.with_label_values(&[label]).inc()
}
}
/// Increment retry counter.
pub fn on_retry(&self) {
if let Some(metrics) = &self.0 {
@@ -103,6 +116,16 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
served_povs: prometheus::register(
CounterVec::new(
Opts::new(
"parachain_served_povs_total",
"Total number of povs served by this backer.",
),
&["success"]
)?,
registry,
)?,
retries: prometheus::register(
Counter::new(
"parachain_fetch_retries_total",
@@ -0,0 +1,333 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! PoV requester takes care of requesting PoVs from validators of a backing group.
use futures::{FutureExt, channel::{mpsc, oneshot}, future::BoxFuture};
use lru::LruCache;
use polkadot_subsystem::jaeger;
use polkadot_node_network_protocol::{
PeerId, peer_set::PeerSet,
request_response::{OutgoingRequest, Recipient, request::{RequestError, Requests},
v1::{PoVFetchingRequest, PoVFetchingResponse}}
};
use polkadot_primitives::v1::{
AuthorityDiscoveryId, CandidateHash, Hash, PoV, SessionIndex, ValidatorIndex
};
use polkadot_subsystem::{
ActiveLeavesUpdate, SubsystemContext, ActivatedLeaf,
messages::{AllMessages, NetworkBridgeMessage, IfDisconnected}
};
use crate::{error::{Error, log_error}, runtime::{Runtime, ValidatorInfo}};
/// Number of sessions we want to keep in the LRU.
const NUM_SESSIONS: usize = 2;
pub struct PoVRequester {
/// We only ever care about being connected to validators of at most two sessions.
///
/// So we keep an LRU for managing connection requests of size 2.
/// Cache will contain `None` if we are not a validator in that session.
connected_validators: LruCache<SessionIndex, Option<mpsc::Receiver<(AuthorityDiscoveryId, PeerId)>>>,
}
impl PoVRequester {
/// Create a new requester for PoVs.
pub fn new() -> Self {
Self {
connected_validators: LruCache::new(NUM_SESSIONS),
}
}
/// Make sure we are connected to the right set of validators.
///
/// On every `ActiveLeavesUpdate`, we check whether we are connected properly to our current
/// validator group.
pub async fn update_connected_validators<Context>(
&mut self,
ctx: &mut Context,
runtime: &mut Runtime,
update: &ActiveLeavesUpdate,
) -> super::Result<()>
where
Context: SubsystemContext,
{
let activated = update.activated.iter().map(|ActivatedLeaf { hash: h, .. }| h);
let activated_sessions =
get_activated_sessions(ctx, runtime, activated).await?;
for (parent, session_index) in activated_sessions {
if self.connected_validators.contains(&session_index) {
continue
}
let rx = connect_to_relevant_validators(ctx, runtime, parent, session_index).await?;
self.connected_validators.put(session_index, rx);
}
Ok(())
}
/// Start background worker for taking care of fetching the requested `PoV` from the network.
pub async fn fetch_pov<Context>(
&self,
ctx: &mut Context,
runtime: &mut Runtime,
parent: Hash,
from_validator: ValidatorIndex,
candidate_hash: CandidateHash,
pov_hash: Hash,
tx: oneshot::Sender<PoV>
) -> super::Result<()>
where
Context: SubsystemContext,
{
let info = &runtime.get_session_info(ctx, parent).await?.session_info;
let authority_id = info.discovery_keys.get(from_validator.0 as usize)
.ok_or(Error::InvalidValidatorIndex)?
.clone();
let (req, pending_response) = OutgoingRequest::new(
Recipient::Authority(authority_id),
PoVFetchingRequest {
candidate_hash,
},
);
let full_req = Requests::PoVFetching(req);
ctx.send_message(
AllMessages::NetworkBridge(
NetworkBridgeMessage::SendRequests(
vec![full_req],
// We are supposed to be connected to validators of our group via `PeerSet`,
// but at session boundaries that is kind of racy, in case a connection takes
// longer to get established, so we try to connect in any case.
IfDisconnected::TryConnect
)
)).await;
let span = jaeger::Span::new(candidate_hash, "fetch-pov")
.with_validator_index(from_validator);
ctx.spawn("pov-fetcher", fetch_pov_job(pov_hash, pending_response.boxed(), span, tx).boxed())
.await
.map_err(|e| Error::SpawnTask(e))
}
}
/// Future to be spawned for taking care of handling reception and sending of PoV.
async fn fetch_pov_job(
pov_hash: Hash,
pending_response: BoxFuture<'static, Result<PoVFetchingResponse, RequestError>>,
span: jaeger::Span,
tx: oneshot::Sender<PoV>,
) {
log_error(
do_fetch_pov(pov_hash, pending_response, span, tx).await,
"fetch_pov_job",
)
}
/// Do the actual work of waiting for the response.
async fn do_fetch_pov(
pov_hash: Hash,
pending_response: BoxFuture<'static, Result<PoVFetchingResponse, RequestError>>,
_span: jaeger::Span,
tx: oneshot::Sender<PoV>,
)
-> super::Result<()>
{
let response = pending_response.await.map_err(Error::FetchPoV)?;
let pov = match response {
PoVFetchingResponse::PoV(compressed) => {
compressed.decompress().map_err(Error::PoVDecompression)?
}
PoVFetchingResponse::NoSuchPoV => {
return Err(Error::NoSuchPoV)
}
};
if pov.hash() == pov_hash {
tx.send(pov).map_err(|_| Error::SendResponse)
} else {
Err(Error::UnexpectedPoV)
}
}
/// Get the session indeces for the given relay chain parents.
async fn get_activated_sessions<Context>(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator<Item = &Hash>)
-> super::Result<impl Iterator<Item = (Hash, SessionIndex)>>
where
Context: SubsystemContext,
{
let mut sessions = Vec::new();
for parent in new_heads {
sessions.push((*parent, runtime.get_session_index(ctx, *parent).await?));
}
Ok(sessions.into_iter())
}
/// Connect to validators of our validator group.
async fn connect_to_relevant_validators<Context>(
ctx: &mut Context,
runtime: &mut Runtime,
parent: Hash,
session: SessionIndex
)
-> super::Result<Option<mpsc::Receiver<(AuthorityDiscoveryId, PeerId)>>>
where
Context: SubsystemContext,
{
if let Some(validator_ids) = determine_relevant_validators(ctx, runtime, parent, session).await? {
// We don't actually care about `PeerId`s, just keeping receiver so we stay connected:
let (tx, rx) = mpsc::channel(0);
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids, peer_set: PeerSet::Validation, connected: tx
})).await;
Ok(Some(rx))
} else {
Ok(None)
}
}
/// Get the validators in our validator group.
///
/// Return: `None` if not a validator.
async fn determine_relevant_validators<Context>(
ctx: &mut Context,
runtime: &mut Runtime,
parent: Hash,
session: SessionIndex,
)
-> super::Result<Option<Vec<AuthorityDiscoveryId>>>
where
Context: SubsystemContext,
{
let info = runtime.get_session_info_by_index(ctx, parent, session).await?;
if let ValidatorInfo {
our_index: Some(our_index),
our_group: Some(our_group)
} = &info.validator_info {
let indeces = info.session_info.validator_groups.get(our_group.0 as usize)
.expect("Our group got retrieved from that session info, it must exist. qed.")
.clone();
Ok(Some(
indeces.into_iter()
.filter(|i| *i != *our_index)
.map(|i| info.session_info.discovery_keys[i.0 as usize].clone())
.collect()
))
} else {
Ok(None)
}
}
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use futures::{executor, future};
use parity_scale_codec::Encode;
use sp_core::testing::TaskExecutor;
use polkadot_primitives::v1::{BlockData, CandidateHash, CompressedPoV, Hash, ValidatorIndex};
use polkadot_subsystem_testhelpers as test_helpers;
use polkadot_subsystem::messages::{AvailabilityDistributionMessage, RuntimeApiMessage, RuntimeApiRequest};
use super::*;
use crate::LOG_TARGET;
use crate::tests::mock::{make_session_info, make_ferdie_keystore};
#[test]
fn rejects_invalid_pov() {
sp_tracing::try_init_simple();
let pov = PoV {
block_data: BlockData(vec![1,2,3,4,5,6]),
};
test_run(Hash::default(), pov);
}
#[test]
fn accepts_valid_pov() {
sp_tracing::try_init_simple();
let pov = PoV {
block_data: BlockData(vec![1,2,3,4,5,6]),
};
test_run(pov.hash(), pov);
}
fn test_run(pov_hash: Hash, pov: PoV) {
let requester = PoVRequester::new();
let pool = TaskExecutor::new();
let (mut context, mut virtual_overseer) =
test_helpers::make_subsystem_context::<AvailabilityDistributionMessage, TaskExecutor>(pool.clone());
let keystore = make_ferdie_keystore();
let mut runtime = crate::runtime::Runtime::new(keystore);
let (tx, rx) = oneshot::channel();
let testee = async {
requester.fetch_pov(
&mut context,
&mut runtime,
Hash::default(),
ValidatorIndex(0),
CandidateHash::default(),
pov_hash,
tx,
).await.expect("Should succeed");
};
let tester = async move {
loop {
match virtual_overseer.recv().await {
AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionIndexForChild(tx)
)
) => {
tx.send(Ok(0)).unwrap();
}
AllMessages::RuntimeApi(
RuntimeApiMessage::Request(
_,
RuntimeApiRequest::SessionInfo(_, tx)
)
) => {
tx.send(Ok(Some(make_session_info()))).unwrap();
}
AllMessages::NetworkBridge(NetworkBridgeMessage::SendRequests(mut reqs, _)) => {
let req = assert_matches!(
reqs.pop(),
Some(Requests::PoVFetching(outgoing)) => {outgoing}
);
req.pending_response.send(Ok(PoVFetchingResponse::PoV(
CompressedPoV::compress(&pov).unwrap()).encode()
)).unwrap();
break
},
msg => tracing::debug!(target: LOG_TARGET, msg = ?msg, "Received msg"),
}
}
if pov.hash() == pov_hash {
assert_eq!(rx.await, Ok(pov));
} else {
assert_eq!(rx.await, Err(oneshot::Canceled));
}
};
futures::pin_mut!(testee);
futures::pin_mut!(tester);
executor::block_on(future::join(testee, tester));
}
}
@@ -138,7 +138,7 @@ impl FetchTaskConfig {
let live_in = vec![leaf].into_iter().collect();
// Don't run tasks for our backing group:
if session_info.our_group == core.group_responsible {
if session_info.our_group == Some(core.group_responsible) {
return FetchTaskConfig {
live_in,
prepared_running: None,
@@ -39,7 +39,7 @@ use polkadot_subsystem::{
};
use super::{error::recv_runtime, session_cache::SessionCache, LOG_TARGET, Metrics};
use crate::error::NonFatalError;
use crate::error::Error;
/// A task fetching a particular chunk.
mod fetch_task;
@@ -96,7 +96,7 @@ impl Requester {
&mut self,
ctx: &mut Context,
update: ActiveLeavesUpdate,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
@@ -111,9 +111,9 @@ impl Requester {
} = update;
// Order important! We need to handle activated, prior to deactivated, otherwise we might
// cancel still needed jobs.
let err = self.start_requesting_chunks(ctx, activated.into_iter()).await?;
self.start_requesting_chunks(ctx, activated.into_iter()).await?;
self.stop_requesting_chunks(deactivated.into_iter());
Ok(err)
Ok(())
}
/// Start requesting chunks for newly imported heads.
@@ -121,25 +121,20 @@ impl Requester {
&mut self,
ctx: &mut Context,
new_heads: impl Iterator<Item = ActivatedLeaf>,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
for ActivatedLeaf { hash: leaf, .. } in new_heads {
let cores = match query_occupied_cores(ctx, leaf).await {
Err(err) => return Ok(Some(err)),
Ok(cores) => cores,
};
let cores = query_occupied_cores(ctx, leaf).await?;
tracing::trace!(
target: LOG_TARGET,
occupied_cores = ?cores,
"Query occupied core"
);
if let Some(err) = self.add_cores(ctx, leaf, cores).await? {
return Ok(Some(err));
}
self.add_cores(ctx, leaf, cores).await?;
}
Ok(None)
Ok(())
}
/// Stop requesting chunks for obsolete heads.
@@ -164,7 +159,7 @@ impl Requester {
ctx: &mut Context,
leaf: Hash,
cores: impl IntoIterator<Item = OccupiedCore>,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
@@ -179,7 +174,7 @@ impl Requester {
let tx = self.tx.clone();
let metrics = self.metrics.clone();
let task_cfg = match self
let task_cfg = self
.session_cache
.with_session_info(
ctx,
@@ -189,11 +184,7 @@ impl Requester {
leaf,
|info| FetchTaskConfig::new(leaf, &core, tx, metrics, info),
)
.await
{
Err(err) => return Ok(Some(err)),
Ok(task_cfg) => task_cfg,
};
.await?;
if let Some(task_cfg) = task_cfg {
e.insert(FetchTask::start(task_cfg, ctx).await?);
@@ -202,7 +193,7 @@ impl Requester {
}
}
}
Ok(None)
Ok(())
}
}
@@ -237,7 +228,7 @@ impl Stream for Requester {
async fn query_occupied_cores<Context>(
ctx: &mut Context,
relay_parent: Hash,
) -> Result<Vec<OccupiedCore>, NonFatalError>
) -> Result<Vec<OccupiedCore>, Error>
where
Context: SubsystemContext,
{
@@ -19,7 +19,7 @@
use futures::channel::oneshot;
use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1};
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
use polkadot_primitives::v1::{AvailableData, CandidateHash, CompressedPoV, ErasureChunk, ValidatorIndex};
use polkadot_subsystem::{
messages::{AllMessages, AvailabilityStoreMessage},
SubsystemContext, jaeger,
@@ -28,10 +28,36 @@ use polkadot_subsystem::{
use crate::error::{Error, Result};
use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}};
/// Variant of `answer_request` that does Prometheus metric and logging on errors.
/// Variant of `answer_pov_request` that does Prometheus metric and logging on errors.
///
/// Any errors of `answer_pov_request` will simply be logged.
pub async fn answer_pov_request_log<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::PoVFetchingRequest>,
metrics: &Metrics,
)
where
Context: SubsystemContext,
{
let res = answer_pov_request(ctx, req).await;
match res {
Ok(result) =>
metrics.on_served_pov(if result {SUCCEEDED} else {NOT_FOUND}),
Err(err) => {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Serving PoV failed with error"
);
metrics.on_served_pov(FAILED);
}
}
}
/// Variant of `answer_chunk_request` that does Prometheus metric and logging on errors.
///
/// Any errors of `answer_request` will simply be logged.
pub async fn answer_request_log<Context>(
pub async fn answer_chunk_request_log<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::ChunkFetchingRequest>,
metrics: &Metrics,
@@ -39,33 +65,71 @@ pub async fn answer_request_log<Context>(
where
Context: SubsystemContext,
{
let res = answer_request(ctx, req).await;
let res = answer_chunk_request(ctx, req).await;
match res {
Ok(result) =>
metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}),
metrics.on_served_chunk(if result {SUCCEEDED} else {NOT_FOUND}),
Err(err) => {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Serving chunk failed with error"
);
metrics.on_served(FAILED);
metrics.on_served_chunk(FAILED);
}
}
}
/// Answer an incoming PoV fetch request by querying the av store.
///
/// Returns: Ok(true) if chunk was found and served.
pub async fn answer_pov_request<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::PoVFetchingRequest>,
) -> Result<bool>
where
Context: SubsystemContext,
{
let _span = jaeger::Span::new(req.payload.candidate_hash, "answer-pov-request");
let av_data = query_available_data(ctx, req.payload.candidate_hash).await?;
let result = av_data.is_some();
let response = match av_data {
None => v1::PoVFetchingResponse::NoSuchPoV,
Some(av_data) => {
let pov = match CompressedPoV::compress(&av_data.pov) {
Ok(pov) => pov,
Err(error) => {
tracing::error!(
target: LOG_TARGET,
error = ?error,
"Failed to create `CompressedPov`",
);
// this should really not happen, let this request time out:
return Err(Error::PoVDecompression(error))
}
};
v1::PoVFetchingResponse::PoV(pov)
}
};
req.send_response(response).map_err(|_| Error::SendResponse)?;
Ok(result)
}
/// Answer an incoming chunk request by querying the av store.
///
/// Returns: Ok(true) if chunk was found and served.
pub async fn answer_request<Context>(
pub async fn answer_chunk_request<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::ChunkFetchingRequest>,
) -> Result<bool>
where
Context: SubsystemContext,
{
let span = jaeger::Span::new(req.payload.candidate_hash, "answer-request")
.with_stage(jaeger::Stage::AvailabilityDistribution);
let span = jaeger::Span::new(req.payload.candidate_hash, "answer-chunk-request");
let _child_span = span.child("answer-chunk-request")
.with_chunk_index(req.payload.index.0);
@@ -119,3 +183,21 @@ where
Error::QueryChunkResponseChannel(e)
})
}
/// Query PoV from the availability store.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn query_available_data<Context>(
ctx: &mut Context,
candidate_hash: CandidateHash,
) -> Result<Option<AvailableData>>
where
Context: SubsystemContext,
{
let (tx, rx) = oneshot::channel();
ctx.send_message(AllMessages::AvailabilityStore(
AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx),
))
.await;
rx.await.map_err(|e| Error::QueryAvailableDataResponseChannel(e))
}
@@ -0,0 +1,197 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Convenient interface to the runtime.
use lru::LruCache;
use sp_application_crypto::AppKey;
use sp_core::crypto::Public;
use sp_keystore::{CryptoStore, SyncCryptoStorePtr};
use polkadot_node_subsystem_util::{
request_session_index_for_child_ctx, request_session_info_ctx,
};
use polkadot_primitives::v1::{GroupIndex, Hash, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex};
use polkadot_subsystem::SubsystemContext;
use super::{
error::recv_runtime,
Error,
};
/// Caching of session info as needed by availability distribution.
///
/// It should be ensured that a cached session stays live in the cache as long as we might need it.
pub struct Runtime {
/// Get the session index for a given relay parent.
///
/// We query this up to a 100 times per block, so caching it here without roundtrips over the
/// overseer seems sensible.
session_index_cache: LruCache<Hash, SessionIndex>,
/// Look up cached sessions by SessionIndex.
session_info_cache: LruCache<SessionIndex, ExtendedSessionInfo>,
/// Key store for determining whether we are a validator and what `ValidatorIndex` we have.
keystore: SyncCryptoStorePtr,
}
/// SessionInfo with additional useful data for validator nodes.
pub struct ExtendedSessionInfo {
/// Actual session info as fetched from the runtime.
pub session_info: SessionInfo,
/// Contains useful information about ourselves, in case this node is a validator.
pub validator_info: ValidatorInfo,
}
/// Information about ourself, in case we are an `Authority`.
///
/// This data is derived from the `SessionInfo` and our key as found in the keystore.
pub struct ValidatorInfo {
/// The index this very validator has in `SessionInfo` vectors, if any.
pub our_index: Option<ValidatorIndex>,
/// The group we belong to, if any.
pub our_group: Option<GroupIndex>,
}
impl Runtime {
/// Create a new `Runtime` for convenient runtime fetches.
pub fn new(keystore: SyncCryptoStorePtr) -> Self {
Self {
// 5 relatively conservative, 1 to 2 should suffice:
session_index_cache: LruCache::new(5),
// We need to cache the current and the last session the most:
session_info_cache: LruCache::new(2),
keystore,
}
}
/// Retrieve the current session index.
pub async fn get_session_index<Context>(
&mut self,
ctx: &mut Context,
parent: Hash,
) -> Result<SessionIndex, Error>
where
Context: SubsystemContext,
{
match self.session_index_cache.get(&parent) {
Some(index) => Ok(*index),
None => {
let index =
recv_runtime(request_session_index_for_child_ctx(parent, ctx).await)
.await?;
self.session_index_cache.put(parent, index);
Ok(index)
}
}
}
/// Get `ExtendedSessionInfo` by relay parent hash.
pub async fn get_session_info<'a, Context>(
&'a mut self,
ctx: &mut Context,
parent: Hash,
) -> Result<&'a ExtendedSessionInfo, Error>
where
Context: SubsystemContext,
{
let session_index = self.get_session_index(ctx, parent).await?;
self.get_session_info_by_index(ctx, parent, session_index).await
}
/// Get `ExtendedSessionInfo` by session index.
///
/// `request_session_info_ctx` still requires the parent to be passed in, so we take the parent
/// in addition to the `SessionIndex`.
pub async fn get_session_info_by_index<'a, Context>(
&'a mut self,
ctx: &mut Context,
parent: Hash,
session_index: SessionIndex,
) -> Result<&'a ExtendedSessionInfo, Error>
where
Context: SubsystemContext,
{
if !self.session_info_cache.contains(&session_index) {
let session_info =
recv_runtime(request_session_info_ctx(parent, session_index, ctx).await)
.await?
.ok_or(Error::NoSuchSession(session_index))?;
let validator_info = self.get_validator_info(&session_info).await?;
let full_info = ExtendedSessionInfo {
session_info,
validator_info,
};
self.session_info_cache.put(session_index, full_info);
}
Ok(
self.session_info_cache.get(&session_index)
.expect("We just put the value there. qed.")
)
}
/// Build `ValidatorInfo` for the current session.
///
///
/// Returns: `None` if not a validator.
async fn get_validator_info(
&self,
session_info: &SessionInfo,
) -> Result<ValidatorInfo, Error>
{
if let Some(our_index) = self.get_our_index(&session_info.validators).await {
// Get our group index:
let our_group = session_info.validator_groups
.iter()
.enumerate()
.find_map(|(i, g)| {
g.iter().find_map(|v| {
if *v == our_index {
Some(GroupIndex(i as u32))
} else {
None
}
})
}
);
let info = ValidatorInfo {
our_index: Some(our_index),
our_group,
};
return Ok(info)
}
return Ok(ValidatorInfo { our_index: None, our_group: None })
}
/// Get our `ValidatorIndex`.
///
/// Returns: None if we are not a validator.
async fn get_our_index(&self, validators: &[ValidatorId]) -> Option<ValidatorIndex> {
for (i, v) in validators.iter().enumerate() {
if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)])
.await
{
return Some(ValidatorIndex(i as u32));
}
}
None
}
}
@@ -33,8 +33,7 @@ use polkadot_primitives::v1::{
use polkadot_subsystem::SubsystemContext;
use super::{
error::{recv_runtime, NonFatalError},
Error,
error::{recv_runtime, Error},
LOG_TARGET,
};
@@ -82,7 +81,9 @@ pub struct SessionInfo {
/// Remember to which group we belong, so we won't start fetching chunks for candidates with
/// our group being responsible. (We should have that chunk already.)
pub our_group: GroupIndex,
///
/// `None`, if we are not in fact part of any group.
pub our_group: Option<GroupIndex>,
}
/// Report of bad validators.
@@ -122,7 +123,7 @@ impl SessionCache {
ctx: &mut Context,
parent: Hash,
with_info: F,
) -> Result<Option<R>, NonFatalError>
) -> Result<Option<R>, Error>
where
Context: SubsystemContext,
F: FnOnce(&SessionInfo) -> R,
@@ -219,7 +220,7 @@ impl SessionCache {
ctx: &mut Context,
parent: Hash,
session_index: SessionIndex,
) -> Result<Option<SessionInfo>, NonFatalError>
) -> Result<Option<SessionInfo>, Error>
where
Context: SubsystemContext,
{
@@ -230,7 +231,7 @@ impl SessionCache {
..
} = recv_runtime(request_session_info_ctx(parent, session_index, ctx).await)
.await?
.ok_or(NonFatalError::NoSuchSession(session_index))?;
.ok_or(Error::NoSuchSession(session_index))?;
if let Some(our_index) = self.get_our_index(validators).await {
// Get our group index:
@@ -245,8 +246,8 @@ impl SessionCache {
None
}
})
})
.expect("Every validator should be in a validator group. qed.");
}
);
// Shuffle validators in groups:
let mut rng = thread_rng();
@@ -274,9 +275,9 @@ impl SessionCache {
session_index,
our_group,
};
return Ok(Some(info));
return Ok(Some(info))
}
return Ok(None);
return Ok(None)
}
/// Get our `ValidatorIndex`.
@@ -19,14 +19,29 @@
use std::sync::Arc;
use sc_keystore::LocalKeystore;
use sp_keyring::Sr25519Keyring;
use sp_application_crypto::AppKey;
use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks};
use polkadot_primitives::v1::{AvailableData, BlockData, CandidateCommitments, CandidateDescriptor,
CandidateHash, CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id
as ParaId, OccupiedCore, PersistedValidationData, PoV, SessionInfo,
ValidatorIndex
use polkadot_primitives::v1::{
AvailableData, BlockData, CandidateCommitments, CandidateDescriptor, CandidateHash,
CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id as ParaId,
OccupiedCore, PersistedValidationData, PoV, SessionInfo, ValidatorId, ValidatorIndex
};
use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr};
/// Get mock keystore with `Ferdie` key.
pub fn make_ferdie_keystore() -> SyncCryptoStorePtr {
let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory());
SyncCryptoStore::sr25519_generate_new(
&*keystore,
ValidatorId::ID,
Some(&Sr25519Keyring::Ferdie.to_seed()),
)
.expect("Insert key into keystore");
keystore
}
/// Create dummy session info with two validator groups.
pub fn make_session_info() -> SessionInfo {
@@ -23,10 +23,7 @@ use smallvec::smallvec;
use futures::{FutureExt, channel::oneshot, SinkExt, channel::mpsc, StreamExt};
use futures_timer::Delay;
use sc_keystore::LocalKeystore;
use sp_application_crypto::AppKey;
use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr};
use sp_keyring::Sr25519Keyring;
use sp_keystore::SyncCryptoStorePtr;
use sp_core::{traits::SpawnNamed, testing::TaskExecutor};
use sc_network as network;
use sc_network::IfDisconnected;
@@ -39,7 +36,7 @@ use polkadot_subsystem::{ActiveLeavesUpdate, FromOverseer, OverseerSignal, Activ
}
};
use polkadot_primitives::v1::{CandidateHash, CoreState, ErasureChunk, GroupIndex, Hash, Id
as ParaId, ScheduledCore, SessionInfo, ValidatorId,
as ParaId, ScheduledCore, SessionInfo,
ValidatorIndex
};
use polkadot_node_network_protocol::{jaeger,
@@ -48,7 +45,7 @@ use polkadot_node_network_protocol::{jaeger,
use polkadot_subsystem_testhelpers as test_helpers;
use test_helpers::SingleItemSink;
use super::mock::{make_session_info, OccupiedCoreBuilder, };
use super::mock::{make_session_info, OccupiedCoreBuilder, make_ferdie_keystore};
use crate::LOG_TARGET;
pub struct TestHarness {
@@ -83,17 +80,10 @@ impl Default for TestState {
let chain_ids = vec![chain_a, chain_b];
let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory());
let keystore = make_ferdie_keystore();
let session_info = make_session_info();
SyncCryptoStore::sr25519_generate_new(
&*keystore,
ValidatorId::ID,
Some(&Sr25519Keyring::Ferdie.to_seed()),
)
.expect("Insert key into keystore");
let (cores, chunks) = {
let mut cores = HashMap::new();
let mut chunks = HashMap::new();
@@ -163,6 +153,9 @@ impl TestState {
/// This will simply advance through the simulated chain and examines whether the subsystem
/// behaves as expected: It will succeed if all valid chunks of other backing groups get stored
/// and no other.
///
/// We try to be as agnostic about details as possible, how the subsystem achieves those goals
/// should not be a matter to this test suite.
async fn run_inner(self, executor: TaskExecutor, virtual_overseer: TestSubsystemContextHandle<AvailabilityDistributionMessage>) {
// We skip genesis here (in reality ActiveLeavesUpdate can also skip a block:
let updates = {
@@ -258,15 +251,12 @@ impl TestState {
}
}
_ => {
panic!("Unexpected message received: {:?}", msg);
}
}
}
}
}
async fn overseer_signal(
mut tx: SingleItemSink<FromOverseer<AvailabilityDistributionMessage>>,
msg: impl Into<OverseerSignal>,
+13 -24
View File
@@ -696,7 +696,6 @@ mod tests {
use polkadot_subsystem::messages::{
ApprovalDistributionMessage,
BitfieldDistributionMessage,
PoVDistributionMessage,
StatementDistributionMessage
};
use polkadot_node_subsystem_test_helpers::{
@@ -897,13 +896,6 @@ mod tests {
) if e == event.focus().expect("could not focus message")
);
assert_matches!(
virtual_overseer.recv().await,
AllMessages::PoVDistribution(
PoVDistributionMessage::NetworkBridgeUpdateV1(e)
) if e == event.focus().expect("could not focus message")
);
assert_matches!(
virtual_overseer.recv().await,
AllMessages::ApprovalDistribution(
@@ -1166,13 +1158,12 @@ mod tests {
).await;
}
let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting(
[0; 32].into(),
vec![[1; 32].into()],
let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals(
Vec::new()
);
let message = protocol_v1::ValidationProtocol::PoVDistribution(
pov_distribution_message.clone(),
let message = protocol_v1::ValidationProtocol::ApprovalDistribution(
approval_distribution_message.clone(),
);
network_handle.peer_message(
@@ -1183,18 +1174,18 @@ mod tests {
network_handle.disconnect_peer(peer.clone(), PeerSet::Validation).await;
// PoV distribution message comes first, and the message is only sent to that subsystem.
// Approval distribution message comes first, and the message is only sent to that subsystem.
// then a disconnection event arises that is sent to all validation networking subsystems.
assert_matches!(
virtual_overseer.recv().await,
AllMessages::PoVDistribution(
PoVDistributionMessage::NetworkBridgeUpdateV1(
AllMessages::ApprovalDistribution(
ApprovalDistributionMessage::NetworkBridgeUpdateV1(
NetworkBridgeEvent::PeerMessage(p, m)
)
) => {
assert_eq!(p, peer);
assert_eq!(m, pov_distribution_message);
assert_eq!(m, approval_distribution_message);
}
);
@@ -1563,13 +1554,12 @@ mod tests {
// send a validation protocol message.
{
let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting(
[0; 32].into(),
vec![[1; 32].into()],
let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals(
Vec::new()
);
let message = protocol_v1::ValidationProtocol::PoVDistribution(
pov_distribution_message.clone(),
let message = protocol_v1::ValidationProtocol::ApprovalDistribution(
approval_distribution_message.clone(),
);
virtual_overseer.send(FromOverseer::Communication {
@@ -1624,7 +1614,7 @@ mod tests {
fn spread_event_to_subsystems_is_up_to_date() {
// Number of subsystems expected to be interested in a network event,
// and hence the network event broadcasted to.
const EXPECTED_COUNT: usize = 4;
const EXPECTED_COUNT: usize = 3;
let mut cnt = 0_usize;
for msg in AllMessages::dispatch_iter(NetworkBridgeEvent::PeerDisconnected(PeerId::random())) {
@@ -1640,7 +1630,6 @@ mod tests {
AllMessages::BitfieldDistribution(_) => { cnt += 1; }
AllMessages::BitfieldSigning(_) => unreachable!("Not interested in network events"),
AllMessages::Provisioner(_) => unreachable!("Not interested in network events"),
AllMessages::PoVDistribution(_) => { cnt += 1; }
AllMessages::RuntimeApi(_) => unreachable!("Not interested in network events"),
AllMessages::AvailabilityStore(_) => unreachable!("Not interested in network events"),
AllMessages::NetworkBridge(_) => unreachable!("Not interested in network events"),
@@ -141,6 +141,11 @@ fn multiplex_single(
decode_with_peer::<v1::CollationFetchingRequest>(peer, payload)?,
pending_response,
)),
Protocol::PoVFetching => From::from(IncomingRequest::new(
peer,
decode_with_peer::<v1::PoVFetchingRequest>(peer, payload)?,
pending_response,
)),
Protocol::AvailableDataFetching => From::from(IncomingRequest::new(
peer,
decode_with_peer::<v1::AvailableDataFetchingRequest>(peer, payload)?,
+8 -8
View File
@@ -237,14 +237,14 @@ impl Network for Arc<NetworkService<Block, Hash>> {
Recipient::Peer(peer_id) => Some(peer_id),
Recipient::Authority(authority) =>
authority_discovery
.get_addresses_by_authority_id(authority)
.await
.and_then(|addrs| {
addrs
.into_iter()
.find_map(|addr| peer_id_from_multiaddr(&addr))
}),
};
.get_addresses_by_authority_id(authority)
.await
.and_then(|addrs| {
addrs
.into_iter()
.find_map(|addr| peer_id_from_multiaddr(&addr))
}),
};
let peer_id = match peer_id {
None => {
@@ -1,25 +0,0 @@
[package]
name = "polkadot-pov-distribution"
version = "0.1.0"
authors = ["Parity Technologies <admin@parity.io>"]
edition = "2018"
[dependencies]
futures = "0.3.12"
thiserror = "1.0.23"
tracing = "0.1.25"
polkadot-primitives = { path = "../../../primitives" }
polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" }
polkadot-node-subsystem-util = { path = "../../subsystem-util" }
polkadot-node-network-protocol = { path = "../../network/protocol" }
[dev-dependencies]
assert_matches = "1.4.0"
env_logger = "0.8.1"
log = "0.4.13"
sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" }
polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" }
@@ -1,33 +0,0 @@
// Copyright 2020 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! The `Error` and `Result` types used by the subsystem.
use thiserror::Error;
#[derive(Debug, Error)]
pub enum Error {
#[error(transparent)]
Subsystem(#[from] polkadot_subsystem::SubsystemError),
#[error(transparent)]
OneshotRecv(#[from] futures::channel::oneshot::Canceled),
#[error(transparent)]
Runtime(#[from] polkadot_subsystem::errors::RuntimeApiError),
#[error(transparent)]
Util(#[from] polkadot_node_subsystem_util::Error),
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -1,996 +0,0 @@
// Copyright 2020 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! PoV Distribution Subsystem of Polkadot.
//!
//! This is a gossip implementation of code that is responsible for distributing PoVs
//! among validators.
#![deny(unused_crate_dependencies)]
#![warn(missing_docs)]
use polkadot_primitives::v1::{CandidateDescriptor, CompressedPoV, CoreIndex, CoreState, Hash, Id as ParaId, PoV, ValidatorId};
use polkadot_subsystem::{
ActiveLeavesUpdate, OverseerSignal, SubsystemContext, SubsystemResult, SubsystemError, Subsystem,
FromOverseer, SpawnedSubsystem,
jaeger,
messages::{
PoVDistributionMessage, AllMessages, NetworkBridgeMessage, NetworkBridgeEvent,
},
};
use polkadot_node_subsystem_util::{
validator_discovery,
request_validators_ctx,
request_validator_groups_ctx,
request_availability_cores_ctx,
metrics::{self, prometheus},
};
use polkadot_node_network_protocol::{
peer_set::PeerSet, v1 as protocol_v1, PeerId, OurView, UnifiedReputationChange as Rep,
};
use futures::prelude::*;
use futures::channel::oneshot;
use std::collections::{hash_map::{Entry, HashMap}, HashSet};
use std::sync::Arc;
mod error;
#[cfg(test)]
mod tests;
const COST_APPARENT_FLOOD: Rep = Rep::CostMajor("Peer appears to be flooding us with PoV requests");
const COST_UNEXPECTED_POV: Rep = Rep::CostMajor("Peer sent us an unexpected PoV");
const COST_AWAITED_NOT_IN_VIEW: Rep
= Rep::CostMinor("Peer claims to be awaiting something outside of its view");
const BENEFIT_FRESH_POV: Rep = Rep::BenefitMinorFirst("Peer supplied us with an awaited PoV");
const BENEFIT_LATE_POV: Rep = Rep::BenefitMinor("Peer supplied us with an awaited PoV, \
but was not the first to do so");
const LOG_TARGET: &str = "parachain::pov-distribution";
/// The PoV Distribution Subsystem.
pub struct PoVDistribution {
// Prometheus metrics
metrics: Metrics,
}
impl<C> Subsystem<C> for PoVDistribution
where C: SubsystemContext<Message = PoVDistributionMessage>
{
fn start(self, ctx: C) -> SpawnedSubsystem {
// Swallow error because failure is fatal to the node and we log with more precision
// within `run`.
let future = self.run(ctx)
.map_err(|e| SubsystemError::with_origin("pov-distribution", e))
.boxed();
SpawnedSubsystem {
name: "pov-distribution-subsystem",
future,
}
}
}
#[derive(Default)]
struct State {
/// A state of things going on on a per-relay-parent basis.
relay_parent_state: HashMap<Hash, BlockBasedState>,
/// Info on peers.
peer_state: HashMap<PeerId, PeerState>,
/// Our own view.
our_view: OurView,
/// Connect to relevant groups of validators at different relay parents.
connection_requests: validator_discovery::ConnectionRequests,
/// Metrics.
metrics: Metrics,
}
struct BlockBasedState {
known: HashMap<Hash, (Arc<PoV>, CompressedPoV)>,
/// All the PoVs we are or were fetching, coupled with channels expecting the data.
///
/// This may be an empty list, which indicates that we were once awaiting this PoV but have
/// received it already.
fetching: HashMap<Hash, Vec<oneshot::Sender<Arc<PoV>>>>,
n_validators: usize,
}
#[derive(Default)]
struct PeerState {
/// A set of awaited PoV-hashes for each relay-parent in the peer's view.
awaited: HashMap<Hash, HashSet<Hash>>,
}
fn awaiting_message(relay_parent: Hash, awaiting: Vec<Hash>)
-> protocol_v1::ValidationProtocol
{
protocol_v1::ValidationProtocol::PoVDistribution(
protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, awaiting)
)
}
fn send_pov_message(
relay_parent: Hash,
pov_hash: Hash,
pov: &CompressedPoV,
) -> protocol_v1::ValidationProtocol {
protocol_v1::ValidationProtocol::PoVDistribution(
protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov.clone())
)
}
/// Handles the signal. If successful, returns `true` if the subsystem should conclude,
/// `false` otherwise.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_signal(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
signal: OverseerSignal,
) -> SubsystemResult<bool> {
match signal {
OverseerSignal::Conclude => Ok(true),
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated }) => {
let _timer = state.metrics.time_handle_signal();
for activated in activated {
let _span = activated.span.child("pov-dist")
.with_stage(jaeger::Stage::PoVDistribution);
let relay_parent = activated.hash;
match request_validators_ctx(relay_parent, ctx).await {
Ok(vals_rx) => {
let n_validators = match vals_rx.await? {
Ok(v) => v.len(),
Err(e) => {
tracing::warn!(
target: LOG_TARGET,
err = ?e,
"Error fetching validators from runtime API for active leaf",
);
// Not adding bookkeeping here might make us behave funny, but we
// shouldn't take down the node on spurious runtime API errors.
//
// and this is "behave funny" as in be bad at our job, but not in any
// slashable or security-related way.
continue;
}
};
state.relay_parent_state.insert(relay_parent, BlockBasedState {
known: HashMap::new(),
fetching: HashMap::new(),
n_validators,
});
}
Err(e) => {
// continue here also as above.
tracing::warn!(
target: LOG_TARGET,
err = ?e,
"Error fetching validators from runtime API for active leaf",
);
}
}
}
for relay_parent in deactivated {
state.connection_requests.remove_all(&relay_parent);
state.relay_parent_state.remove(&relay_parent);
}
Ok(false)
}
OverseerSignal::BlockFinalized(..) => Ok(false),
}
}
/// Notify peers that we are awaiting a given PoV hash.
///
/// This only notifies peers who have the relay parent in their view.
#[tracing::instrument(level = "trace", skip(peers, ctx), fields(subsystem = LOG_TARGET))]
async fn notify_all_we_are_awaiting(
peers: &mut HashMap<PeerId, PeerState>,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent: Hash,
pov_hash: Hash,
) {
// We use `awaited` as a proxy for which heads are in the peer's view.
let peers_to_send: Vec<_> = peers.iter()
.filter_map(|(peer, state)| if state.awaited.contains_key(&relay_parent) {
Some(peer.clone())
} else {
None
})
.collect();
if peers_to_send.is_empty() {
return;
}
let payload = awaiting_message(relay_parent, vec![pov_hash]);
tracing::trace!(
target: LOG_TARGET,
peers = ?peers_to_send,
?relay_parent,
?pov_hash,
"Sending awaiting message",
);
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage(
peers_to_send,
payload,
))).await;
}
/// Notify one peer about everything we're awaiting at a given relay-parent.
#[tracing::instrument(level = "trace", skip(ctx, relay_parent_state), fields(subsystem = LOG_TARGET))]
async fn notify_one_we_are_awaiting_many(
peer: &PeerId,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent_state: &HashMap<Hash, BlockBasedState>,
relay_parent: Hash,
) {
let awaiting_hashes = relay_parent_state.get(&relay_parent).into_iter().flat_map(|s| {
// Send the peer everything we are fetching at this relay-parent
s.fetching.iter()
.filter(|(_, senders)| !senders.is_empty()) // that has not been completed already.
.map(|(pov_hash, _)| *pov_hash)
}).collect::<Vec<_>>();
if awaiting_hashes.is_empty() {
return;
}
tracing::trace!(
target: LOG_TARGET,
?peer,
?relay_parent,
?awaiting_hashes,
"Sending awaiting message",
);
let payload = awaiting_message(relay_parent, awaiting_hashes);
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage(
vec![peer.clone()],
payload,
))).await;
}
/// Distribute a PoV to peers who are awaiting it.
#[tracing::instrument(level = "trace", skip(peers, ctx, metrics, pov), fields(subsystem = LOG_TARGET))]
async fn distribute_to_awaiting(
peers: &mut HashMap<PeerId, PeerState>,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
metrics: &Metrics,
relay_parent: Hash,
pov_hash: Hash,
pov: &CompressedPoV,
) {
// Send to all peers who are awaiting the PoV and have that relay-parent in their view.
//
// Also removes it from their awaiting set.
let peers_to_send: Vec<_> = peers.iter_mut()
.filter_map(|(peer, state)| state.awaited.get_mut(&relay_parent).and_then(|awaited| {
if awaited.remove(&pov_hash) {
Some(peer.clone())
} else {
None
}
}))
.collect();
tracing::trace!(
target: LOG_TARGET,
peers = ?peers_to_send,
?relay_parent,
?pov_hash,
"Sending PoV message",
);
if peers_to_send.is_empty() { return; }
let payload = send_pov_message(relay_parent, pov_hash, pov);
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage(
peers_to_send,
payload,
))).await;
metrics.on_pov_distributed();
}
/// Connect to relevant validators in case we are not already.
async fn connect_to_relevant_validators(
connection_requests: &mut validator_discovery::ConnectionRequests,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent: Hash,
descriptor: &CandidateDescriptor,
) {
let para_id = descriptor.para_id;
if let Ok(Some(relevant_validators)) =
determine_relevant_validators(ctx, relay_parent, para_id).await
{
// We only need one connection request per (relay_parent, para_id)
// so here we take this shortcut to avoid calling `connect_to_validators`
// more than once.
if !connection_requests.contains_request(&relay_parent, para_id) {
tracing::debug!(
target: LOG_TARGET,
validators=?relevant_validators,
?relay_parent,
"connecting to validators"
);
match validator_discovery::connect_to_validators(
ctx,
relay_parent,
relevant_validators,
PeerSet::Validation,
).await {
Ok(new_connection_request) => {
connection_requests.put(relay_parent, para_id, new_connection_request);
}
Err(e) => {
tracing::debug!(
target: LOG_TARGET,
"Failed to create a validator connection request {:?}",
e,
);
}
}
}
}
}
/// Get the Id of the Core that is assigned to the para being collated on if any
/// and the total number of cores.
async fn determine_core(
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
para_id: ParaId,
relay_parent: Hash,
) -> error::Result<Option<(CoreIndex, usize)>> {
let cores = request_availability_cores_ctx(relay_parent, ctx).await?.await??;
for (idx, core) in cores.iter().enumerate() {
if let CoreState::Scheduled(occupied) = core {
if occupied.para_id == para_id {
return Ok(Some(((idx as u32).into(), cores.len())));
}
}
}
Ok(None)
}
/// Figure out a group of validators assigned to a given `ParaId`.
async fn determine_validators_for_core(
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
core_index: CoreIndex,
num_cores: usize,
relay_parent: Hash,
) -> error::Result<Option<Vec<ValidatorId>>> {
let groups = request_validator_groups_ctx(relay_parent, ctx).await?.await??;
let group_index = groups.1.group_for_core(core_index, num_cores);
let connect_to_validators = match groups.0.get(group_index.0 as usize) {
Some(group) => group.clone(),
None => return Ok(None),
};
let validators = request_validators_ctx(relay_parent, ctx).await?.await??;
let validators = connect_to_validators
.into_iter()
.map(|idx| validators[idx.0 as usize].clone())
.collect();
Ok(Some(validators))
}
async fn determine_relevant_validators(
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent: Hash,
para_id: ParaId,
) -> error::Result<Option<Vec<ValidatorId>>> {
// Determine which core the para_id is assigned to.
let (core, num_cores) = match determine_core(ctx, para_id, relay_parent).await? {
Some(core) => core,
None => {
tracing::warn!(
target: LOG_TARGET,
"Looks like no core is assigned to {:?} at {:?}",
para_id,
relay_parent,
);
return Ok(None);
}
};
determine_validators_for_core(ctx, core, num_cores, relay_parent).await
}
/// Handles a `FetchPoV` message.
#[tracing::instrument(level = "trace", skip(ctx, state, response_sender), fields(subsystem = LOG_TARGET))]
async fn handle_fetch(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent: Hash,
descriptor: CandidateDescriptor,
response_sender: oneshot::Sender<Arc<PoV>>,
) {
let _timer = state.metrics.time_handle_fetch();
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
Some(s) => s,
None => return,
};
if let Some((pov, _)) = relay_parent_state.known.get(&descriptor.pov_hash) {
let _ = response_sender.send(pov.clone());
return;
}
{
match relay_parent_state.fetching.entry(descriptor.pov_hash) {
Entry::Occupied(mut e) => {
// we are already awaiting this PoV if there is an entry.
e.get_mut().push(response_sender);
return;
}
Entry::Vacant(e) => {
connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await;
e.insert(vec![response_sender]);
}
}
}
if relay_parent_state.fetching.len() > 2 * relay_parent_state.n_validators {
tracing::warn!(
target = LOG_TARGET,
relay_parent_state.fetching.len = relay_parent_state.fetching.len(),
"other subsystems have requested PoV distribution to fetch more PoVs than reasonably expected",
);
return;
}
// Issue an `Awaiting` message to all peers with this in their view.
notify_all_we_are_awaiting(
&mut state.peer_state,
ctx,
relay_parent,
descriptor.pov_hash
).await
}
/// Handles a `DistributePoV` message.
#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))]
async fn handle_distribute(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
relay_parent: Hash,
descriptor: CandidateDescriptor,
pov: Arc<PoV>,
) {
let _timer = state.metrics.time_handle_distribute();
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
Some(s) => s,
None => return,
};
connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await;
if let Some(our_awaited) = relay_parent_state.fetching.get_mut(&descriptor.pov_hash) {
// Drain all the senders, but keep the entry in the map around intentionally.
//
// It signals that we were at one point awaiting this, so we will be able to tell
// why peers are sending it to us.
for response_sender in our_awaited.drain(..) {
let _ = response_sender.send(pov.clone());
}
}
let encoded_pov = match CompressedPoV::compress(&*pov) {
Ok(pov) => pov,
Err(error) => {
tracing::debug!(
target: LOG_TARGET,
error = ?error,
"Failed to create `CompressedPov`."
);
return
}
};
distribute_to_awaiting(
&mut state.peer_state,
ctx,
&state.metrics,
relay_parent,
descriptor.pov_hash,
&encoded_pov,
).await;
relay_parent_state.known.insert(descriptor.pov_hash, (pov, encoded_pov));
}
/// Report a reputation change for a peer.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn report_peer(
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
peer: PeerId,
rep: Rep,
) {
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(peer, rep))).await
}
/// Handle a notification from a peer that they are awaiting some PoVs.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_awaiting(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
peer: PeerId,
relay_parent: Hash,
pov_hashes: Vec<Hash>,
) {
if !state.our_view.contains(&relay_parent) {
tracing::trace!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hashes,
"Received awaiting message for unknown block",
);
report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await;
return;
}
tracing::trace!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hashes,
"Received awaiting message",
);
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
None => {
tracing::warn!(
target: LOG_TARGET,
?peer,
?relay_parent,
"PoV Distribution relay parent state out-of-sync with our view"
);
return;
}
Some(s) => s,
};
let peer_awaiting = match
state.peer_state.get_mut(&peer).and_then(|s| s.awaited.get_mut(&relay_parent))
{
None => {
report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await;
return;
}
Some(a) => a,
};
let will_be_awaited = peer_awaiting.len() + pov_hashes.len();
if will_be_awaited <= 2 * relay_parent_state.n_validators {
for pov_hash in pov_hashes {
// For all requested PoV hashes, if we have it, we complete the request immediately.
// Otherwise, we note that the peer is awaiting the PoV.
if let Some((_, ref pov)) = relay_parent_state.known.get(&pov_hash) {
tracing::trace!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hash,
"Sending awaited PoV message",
);
let payload = send_pov_message(relay_parent, pov_hash, pov);
ctx.send_message(AllMessages::NetworkBridge(
NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload)
)).await;
} else {
peer_awaiting.insert(pov_hash);
}
}
} else {
tracing::debug!(
target: LOG_TARGET,
?peer,
?relay_parent,
"Too many PoV requests",
);
report_peer(ctx, peer, COST_APPARENT_FLOOD).await;
}
}
/// Handle an incoming PoV from our peer. Reports them if unexpected, rewards them if not.
///
/// Completes any requests awaiting that PoV.
#[tracing::instrument(level = "trace", skip(ctx, state, encoded_pov), fields(subsystem = LOG_TARGET))]
async fn handle_incoming_pov(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
peer: PeerId,
relay_parent: Hash,
pov_hash: Hash,
encoded_pov: CompressedPoV,
) {
let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) {
None => {
tracing::debug!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hash,
"Unexpected PoV",
);
report_peer(ctx, peer, COST_UNEXPECTED_POV).await;
return;
},
Some(r) => r,
};
let pov = match encoded_pov.decompress() {
Ok(pov) => pov,
Err(error) => {
tracing::debug!(
target: LOG_TARGET,
error = ?error,
"Could not extract PoV",
);
return;
}
};
let pov = {
// Do validity checks and complete all senders awaiting this PoV.
let fetching = match relay_parent_state.fetching.get_mut(&pov_hash) {
None => {
tracing::debug!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hash,
"Unexpected PoV",
);
report_peer(ctx, peer, COST_UNEXPECTED_POV).await;
return;
}
Some(f) => f,
};
let hash = pov.hash();
if hash != pov_hash {
tracing::debug!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hash,
?hash,
"Mismatched PoV",
);
report_peer(ctx, peer, COST_UNEXPECTED_POV).await;
return;
}
let pov = Arc::new(pov);
if fetching.is_empty() {
// fetching is empty whenever we were awaiting something and
// it was completed afterwards.
report_peer(ctx, peer.clone(), BENEFIT_LATE_POV).await;
} else {
// fetching is non-empty when the peer just provided us with data we needed.
report_peer(ctx, peer.clone(), BENEFIT_FRESH_POV).await;
}
for response_sender in fetching.drain(..) {
let _ = response_sender.send(pov.clone());
}
pov
};
tracing::debug!(
target: LOG_TARGET,
?peer,
?relay_parent,
?pov_hash,
"Received PoV",
);
// make sure we don't consider this peer as awaiting that PoV anymore.
if let Some(peer_state) = state.peer_state.get_mut(&peer) {
peer_state.awaited.remove(&pov_hash);
}
// distribute the PoV to all other peers who are awaiting it.
distribute_to_awaiting(
&mut state.peer_state,
ctx,
&state.metrics,
relay_parent,
pov_hash,
&encoded_pov,
).await;
relay_parent_state.known.insert(pov_hash, (pov, encoded_pov));
}
/// Handles a newly or already connected validator in the context of some relay leaf.
fn handle_validator_connected(state: &mut State, peer_id: PeerId) {
state.peer_state.entry(peer_id).or_default();
}
/// Handles a network bridge update.
#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))]
async fn handle_network_update(
state: &mut State,
ctx: &mut impl SubsystemContext<Message = PoVDistributionMessage>,
update: NetworkBridgeEvent<protocol_v1::PoVDistributionMessage>,
) {
let _timer = state.metrics.time_handle_network_update();
match update {
NetworkBridgeEvent::PeerConnected(peer, role) => {
tracing::trace!(
target: LOG_TARGET,
?peer,
?role,
"Peer connected",
);
handle_validator_connected(state, peer);
}
NetworkBridgeEvent::PeerDisconnected(peer) => {
tracing::trace!(
target: LOG_TARGET,
?peer,
"Peer disconnected",
);
state.peer_state.remove(&peer);
}
NetworkBridgeEvent::PeerViewChange(peer_id, view) => {
tracing::trace!(
target: LOG_TARGET,
?peer_id,
?view,
"Peer view change",
);
if let Some(peer_state) = state.peer_state.get_mut(&peer_id) {
// prune anything not in the new view.
peer_state.awaited.retain(|relay_parent, _| view.contains(&relay_parent));
// introduce things from the new view.
for relay_parent in view.iter() {
if let Entry::Vacant(entry) = peer_state.awaited.entry(*relay_parent) {
entry.insert(HashSet::new());
// Notify the peer about everything we're awaiting at the new relay-parent.
notify_one_we_are_awaiting_many(
&peer_id,
ctx,
&state.relay_parent_state,
*relay_parent,
).await;
}
}
}
}
NetworkBridgeEvent::PeerMessage(peer, message) => {
match message {
protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, pov_hashes)
=> handle_awaiting(
state,
ctx,
peer,
relay_parent,
pov_hashes,
).await,
protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov)
=> handle_incoming_pov(
state,
ctx,
peer,
relay_parent,
pov_hash,
pov,
).await,
}
}
NetworkBridgeEvent::OurViewChange(view) => {
tracing::trace!(
target: LOG_TARGET,
"Own view change",
);
state.our_view = view;
}
}
}
impl PoVDistribution {
/// Create a new instance of `PovDistribution`.
pub fn new(metrics: Metrics) -> Self {
Self { metrics }
}
#[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))]
async fn run(
self,
ctx: impl SubsystemContext<Message = PoVDistributionMessage>,
) -> SubsystemResult<()> {
self.run_with_state(ctx, State::default()).await
}
async fn run_with_state(
self,
mut ctx: impl SubsystemContext<Message = PoVDistributionMessage>,
mut state: State,
) -> SubsystemResult<()> {
state.metrics = self.metrics;
loop {
// `select_biased` is used since receiving connection notifications and
// peer view update messages may be racy and we want connection notifications
// first.
futures::select_biased! {
v = state.connection_requests.next().fuse() => handle_validator_connected(&mut state, v.peer_id),
v = ctx.recv().fuse() => {
match v? {
FromOverseer::Signal(signal) => if handle_signal(
&mut state,
&mut ctx,
signal,
).await? {
return Ok(());
}
FromOverseer::Communication { msg } => match msg {
PoVDistributionMessage::FetchPoV(relay_parent, descriptor, response_sender) =>
handle_fetch(
&mut state,
&mut ctx,
relay_parent,
descriptor,
response_sender,
).await,
PoVDistributionMessage::DistributePoV(relay_parent, descriptor, pov) =>
handle_distribute(
&mut state,
&mut ctx,
relay_parent,
descriptor,
pov,
).await,
PoVDistributionMessage::NetworkBridgeUpdateV1(event) =>
handle_network_update(
&mut state,
&mut ctx,
event,
).await,
}
}
}
}
}
}
}
#[derive(Clone)]
struct MetricsInner {
povs_distributed: prometheus::Counter<prometheus::U64>,
handle_signal: prometheus::Histogram,
handle_fetch: prometheus::Histogram,
handle_distribute: prometheus::Histogram,
handle_network_update: prometheus::Histogram,
}
/// Availability Distribution metrics.
#[derive(Default, Clone)]
pub struct Metrics(Option<MetricsInner>);
impl Metrics {
fn on_pov_distributed(&self) {
if let Some(metrics) = &self.0 {
metrics.povs_distributed.inc();
}
}
/// Provide a timer for `handle_signal` which observes on drop.
fn time_handle_signal(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_signal.start_timer())
}
/// Provide a timer for `handle_fetch` which observes on drop.
fn time_handle_fetch(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_fetch.start_timer())
}
/// Provide a timer for `handle_distribute` which observes on drop.
fn time_handle_distribute(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_distribute.start_timer())
}
/// Provide a timer for `handle_network_update` which observes on drop.
fn time_handle_network_update(&self) -> Option<metrics::prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.handle_network_update.start_timer())
}
}
impl metrics::Metrics for Metrics {
fn try_register(registry: &prometheus::Registry) -> std::result::Result<Self, prometheus::PrometheusError> {
let metrics = MetricsInner {
povs_distributed: prometheus::register(
prometheus::Counter::new(
"parachain_povs_distributed_total",
"Number of PoVs distributed to other peers."
)?,
registry,
)?,
handle_signal: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_signal",
"Time spent within `pov_distribution::handle_signal`",
)
)?,
registry,
)?,
handle_fetch: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_fetch",
"Time spent within `pov_distribution::handle_fetch`",
)
)?,
registry,
)?,
handle_distribute: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_distribute",
"Time spent within `pov_distribution::handle_distribute`",
)
)?,
registry,
)?,
handle_network_update: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"parachain_pov_distribution_handle_network_update",
"Time spent within `pov_distribution::handle_network_update`",
)
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
}
File diff suppressed because it is too large Load Diff
@@ -13,3 +13,4 @@ parity-scale-codec = { version = "2.0.0", default-features = false, features = [
sc-network = { git = "https://github.com/paritytech/substrate", branch = "master" }
strum = { version = "0.20", features = ["derive"] }
futures = "0.3.12"
thiserror = "1.0.23"
+1 -18
View File
@@ -289,7 +289,7 @@ pub mod v1 {
use std::convert::TryFrom;
use polkadot_primitives::v1::{
CandidateIndex, CollatorId, CompressedPoV, Hash, Id as ParaId, SignedAvailabilityBitfield,
CandidateIndex, CollatorId, Hash, Id as ParaId, SignedAvailabilityBitfield,
CollatorSignature,
};
use polkadot_node_primitives::{
@@ -305,19 +305,6 @@ pub mod v1 {
Bitfield(Hash, SignedAvailabilityBitfield),
}
/// Network messages used by the PoV distribution subsystem.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum PoVDistributionMessage {
/// Notification that we are awaiting the given PoVs (by hash) against a
/// specific relay-parent hash.
#[codec(index = 0)]
Awaiting(Hash, Vec<Hash>),
/// Notification of an awaited PoV, in a given relay-parent context.
/// (relay_parent, pov_hash, compressed_pov)
#[codec(index = 1)]
SendPoV(Hash, Hash, CompressedPoV),
}
/// Network messages used by the statement distribution subsystem.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub enum StatementDistributionMessage {
@@ -361,9 +348,6 @@ pub mod v1 {
/// Bitfield distribution messages
#[codec(index = 1)]
BitfieldDistribution(BitfieldDistributionMessage),
/// PoV Distribution messages
#[codec(index = 2)]
PoVDistribution(PoVDistributionMessage),
/// Statement distribution messages
#[codec(index = 3)]
StatementDistribution(StatementDistributionMessage),
@@ -373,7 +357,6 @@ pub mod v1 {
}
impl_try_from!(ValidationProtocol, BitfieldDistribution, BitfieldDistributionMessage);
impl_try_from!(ValidationProtocol, PoVDistribution, PoVDistributionMessage);
impl_try_from!(ValidationProtocol, StatementDistribution, StatementDistributionMessage);
impl_try_from!(ValidationProtocol, ApprovalDistribution, ApprovalDistributionMessage);
@@ -60,6 +60,8 @@ pub enum Protocol {
ChunkFetching,
/// Protocol for fetching collations from collators.
CollationFetching,
/// Protocol for fetching seconded PoVs from validators of the same group.
PoVFetching,
/// Protocol for fetching available data.
AvailableDataFetching,
}
@@ -107,11 +109,18 @@ impl Protocol {
request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED,
inbound_queue: Some(tx),
},
Protocol::PoVFetching => RequestResponseConfig {
name: p_name,
max_request_size: 1_000,
max_response_size: MAX_COMPRESSED_POV_SIZE as u64,
request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED,
inbound_queue: Some(tx),
},
Protocol::AvailableDataFetching => RequestResponseConfig {
name: p_name,
max_request_size: 1_000,
// Available data size is dominated by the PoV size.
max_response_size: 30_000_000,
max_response_size: MAX_COMPRESSED_POV_SIZE as u64,
request_timeout: DEFAULT_REQUEST_TIMEOUT,
inbound_queue: Some(tx),
},
@@ -130,6 +139,8 @@ impl Protocol {
Protocol::ChunkFetching => 100,
// 10 seems reasonable, considering group sizes of max 10 validators.
Protocol::CollationFetching => 10,
// 10 seems reasonable, considering group sizes of max 10 validators.
Protocol::PoVFetching => 10,
// Validators are constantly self-selecting to request available data which may lead
// to constant load and occasional burstiness.
Protocol::AvailableDataFetching => 100,
@@ -146,6 +157,7 @@ impl Protocol {
match self {
Protocol::ChunkFetching => "/polkadot/req_chunk/1",
Protocol::CollationFetching => "/polkadot/req_collation/1",
Protocol::PoVFetching => "/polkadot/req_pov/1",
Protocol::AvailableDataFetching => "/polkadot/req_available_data/1",
}
}
@@ -17,6 +17,7 @@
use futures::channel::oneshot;
use futures::prelude::Future;
use thiserror::Error;
use parity_scale_codec::{Decode, Encode, Error as DecodingError};
use sc_network as network;
use sc_network::config as netconfig;
@@ -42,6 +43,8 @@ pub enum Requests {
ChunkFetching(OutgoingRequest<v1::ChunkFetchingRequest>),
/// Fetch a collation from a collator which previously announced it.
CollationFetching(OutgoingRequest<v1::CollationFetchingRequest>),
/// Fetch a PoV from a validator which previously sent out a seconded statement.
PoVFetching(OutgoingRequest<v1::PoVFetchingRequest>),
/// Request full available data from a node.
AvailableDataFetching(OutgoingRequest<v1::AvailableDataFetchingRequest>),
}
@@ -52,6 +55,7 @@ impl Requests {
match self {
Self::ChunkFetching(_) => Protocol::ChunkFetching,
Self::CollationFetching(_) => Protocol::CollationFetching,
Self::PoVFetching(_) => Protocol::PoVFetching,
Self::AvailableDataFetching(_) => Protocol::AvailableDataFetching,
}
}
@@ -67,6 +71,7 @@ impl Requests {
match self {
Self::ChunkFetching(r) => r.encode_request(),
Self::CollationFetching(r) => r.encode_request(),
Self::PoVFetching(r) => r.encode_request(),
Self::AvailableDataFetching(r) => r.encode_request(),
}
}
@@ -96,16 +101,19 @@ pub struct OutgoingRequest<Req> {
}
/// Any error that can occur when sending a request.
#[derive(Debug)]
#[derive(Debug, Error)]
pub enum RequestError {
/// Response could not be decoded.
InvalidResponse(DecodingError),
#[error("Response could not be decoded")]
InvalidResponse(#[source] DecodingError),
/// Some error in substrate/libp2p happened.
NetworkError(network::RequestFailure),
#[error("Some network error occurred")]
NetworkError(#[source] network::RequestFailure),
/// Response got canceled by networking.
Canceled(oneshot::Canceled),
#[error("Response channel got canceled")]
Canceled(#[source] oneshot::Canceled),
}
/// Responses received for an `OutgoingRequest`.
@@ -114,6 +114,29 @@ impl IsRequest for CollationFetchingRequest {
const PROTOCOL: Protocol = Protocol::CollationFetching;
}
/// Request the advertised collation at that relay-parent.
#[derive(Debug, Clone, Encode, Decode)]
pub struct PoVFetchingRequest {
/// Candidate we want a PoV for.
pub candidate_hash: CandidateHash,
}
/// Responses to `PoVFetchingRequest`.
#[derive(Debug, Clone, Encode, Decode)]
pub enum PoVFetchingResponse {
/// Deliver requested PoV.
#[codec(index = 0)]
PoV(CompressedPoV),
/// PoV was not found in store.
#[codec(index = 1)]
NoSuchPoV,
}
impl IsRequest for PoVFetchingRequest {
type Response = PoVFetchingResponse;
const PROTOCOL: Protocol = Protocol::PoVFetching;
}
/// Request the entire available data for a candidate.
#[derive(Debug, Clone, Encode, Decode)]
pub struct AvailableDataFetchingRequest {