Request based PoV distribution (#2640)

* Indentation fix.

* Prepare request-response for PoV fetching.

* Drop old PoV distribution.

* WIP: Fetch PoV directly from backing.

* Backing compiles.

* Runtime access and connection management for PoV distribution.

* Get rid of seemingly dead code.

* Implement PoV fetching.

Backing does not yet use it.

* Don't send `ConnectToValidators` for empty list.

* Even better - no need to check over and over again.

* PoV fetching implemented.

+ Typechecks
+ Should work

Missing:

- Guide
- Tests
- Do fallback fetching in case fetching from seconding validator fails.

* Check PoV hash upon reception.

* Implement retry of PoV fetching in backing.

* Avoid pointless validation spawning.

* Add jaeger span to pov requesting.

* Add back tracing.

* Review remarks.

* Whitespace.

* Whitespace again.

* Cleanup + fix tests.

* Log to log target in overseer.

* Fix more tests.

* Don't fail if group cannot be found.

* Simple test for PoV fetcher.

* Handle missing group membership better.

* Add test for retry functionality.

* Fix flaky test.

* Spaces again.

* Guide updates.

* Spaces.
This commit is contained in:
Robert Klotzner
2021-03-28 17:11:38 +02:00
committed by GitHub
parent 27b6d83974
commit c6f07d8f31
35 changed files with 1382 additions and 3184 deletions
@@ -19,7 +19,7 @@
use futures::channel::oneshot;
use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1};
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
use polkadot_primitives::v1::{AvailableData, CandidateHash, CompressedPoV, ErasureChunk, ValidatorIndex};
use polkadot_subsystem::{
messages::{AllMessages, AvailabilityStoreMessage},
SubsystemContext, jaeger,
@@ -28,10 +28,36 @@ use polkadot_subsystem::{
use crate::error::{Error, Result};
use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}};
/// Variant of `answer_request` that does Prometheus metric and logging on errors.
/// Wrapper around `answer_pov_request` that reports the outcome to Prometheus.
///
/// The request is answered via `answer_pov_request`; the result is then
/// recorded with `Metrics::on_served_pov` as `SUCCEEDED` (PoV served),
/// `NOT_FOUND` (no PoV available) or `FAILED` (an error occurred).
/// Errors are not propagated, they are only logged at `warn` level.
pub async fn answer_pov_request_log<Context>(
    ctx: &mut Context,
    req: IncomingRequest<v1::PoVFetchingRequest>,
    metrics: &Metrics,
)
where
    Context: SubsystemContext,
{
    // Collapse the three possible outcomes into a single metric label so the
    // metric is updated in exactly one place.
    let outcome = match answer_pov_request(ctx, req).await {
        Ok(true) => SUCCEEDED,
        Ok(false) => NOT_FOUND,
        Err(err) => {
            tracing::warn!(
                target: LOG_TARGET,
                err= ?err,
                "Serving PoV failed with error"
            );
            FAILED
        }
    };
    metrics.on_served_pov(outcome);
}
/// Variant of `answer_chunk_request` that does Prometheus metric and logging on errors.
///
/// Any errors of `answer_chunk_request` will simply be logged.
pub async fn answer_request_log<Context>(
pub async fn answer_chunk_request_log<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::ChunkFetchingRequest>,
metrics: &Metrics,
@@ -39,33 +65,71 @@ pub async fn answer_request_log<Context>(
where
Context: SubsystemContext,
{
let res = answer_request(ctx, req).await;
let res = answer_chunk_request(ctx, req).await;
match res {
Ok(result) =>
metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}),
metrics.on_served_chunk(if result {SUCCEEDED} else {NOT_FOUND}),
Err(err) => {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Serving chunk failed with error"
);
metrics.on_served(FAILED);
metrics.on_served_chunk(FAILED);
}
}
}
/// Answer an incoming PoV fetch request by querying the av store.
///
/// Returns: Ok(true) if chunk was found and served.
pub async fn answer_pov_request<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::PoVFetchingRequest>,
) -> Result<bool>
where
Context: SubsystemContext,
{
let _span = jaeger::Span::new(req.payload.candidate_hash, "answer-pov-request");
let av_data = query_available_data(ctx, req.payload.candidate_hash).await?;
let result = av_data.is_some();
let response = match av_data {
None => v1::PoVFetchingResponse::NoSuchPoV,
Some(av_data) => {
let pov = match CompressedPoV::compress(&av_data.pov) {
Ok(pov) => pov,
Err(error) => {
tracing::error!(
target: LOG_TARGET,
error = ?error,
"Failed to create `CompressedPov`",
);
// this should really not happen, let this request time out:
return Err(Error::PoVDecompression(error))
}
};
v1::PoVFetchingResponse::PoV(pov)
}
};
req.send_response(response).map_err(|_| Error::SendResponse)?;
Ok(result)
}
/// Answer an incoming chunk request by querying the av store.
///
/// Returns: Ok(true) if chunk was found and served.
pub async fn answer_request<Context>(
pub async fn answer_chunk_request<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::ChunkFetchingRequest>,
) -> Result<bool>
where
Context: SubsystemContext,
{
let span = jaeger::Span::new(req.payload.candidate_hash, "answer-request")
.with_stage(jaeger::Stage::AvailabilityDistribution);
let span = jaeger::Span::new(req.payload.candidate_hash, "answer-chunk-request");
let _child_span = span.child("answer-chunk-request")
.with_chunk_index(req.payload.index.0);
@@ -119,3 +183,21 @@ where
Error::QueryChunkResponseChannel(e)
})
}
/// Query PoV from the availability store.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn query_available_data<Context>(
ctx: &mut Context,
candidate_hash: CandidateHash,
) -> Result<Option<AvailableData>>
where
Context: SubsystemContext,
{
let (tx, rx) = oneshot::channel();
ctx.send_message(AllMessages::AvailabilityStore(
AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx),
))
.await;
rx.await.map_err(|e| Error::QueryAvailableDataResponseChannel(e))
}