fatality based errors (#4448)

* seed commit for fatality based errors

* fatality

* first draft of fatality

* cleanup

* different approach

* simplify

* first working version for enums, with documentation

* add split

* fix simple split test case

* extend README.md

* update fatality impl

* make tests pass

* apply fatality to first subsystem

* fatality fixes

* use fatality in a subsystem

* fix subsystem

* fixup proc macro

* fix/test: log::*! do not execute when log handler is missing

* fix spelling

* rename Runtime2 to something sane

* allow nested split with `forward` annotations

* add free license

* enable and fixup all tests

* use external fatality

Makes this more reviewable.

* bump fatality dep

Avoid duplicate expander compilations.

* migrate availability distribution

* more fatality usage

* chore: bump fatality to 0.0.6

* fixup remaining subsystems

* chore: fmt

* make cargo spellcheck happy

* remove single instance of `#[fatal(false)]`

* last quality sweep

* fixup
This commit is contained in:
Bernhard Schuster
2022-02-25 18:25:26 +01:00
committed by GitHub
parent 85fa087405
commit d946582707
48 changed files with 425 additions and 659 deletions
@@ -17,9 +17,9 @@
//! Error handling related code and Error/Result definitions.
use fatality::Nested;
use polkadot_node_network_protocol::request_response::outgoing::RequestError;
use polkadot_primitives::v1::SessionIndex;
use thiserror::Error;
use futures::channel::oneshot;
@@ -28,116 +28,86 @@ use polkadot_subsystem::{ChainApiError, SubsystemError};
use crate::LOG_TARGET;
#[derive(Debug, Error, derive_more::From)]
#[error(transparent)]
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// All fatal errors.
Fatal(Fatal),
/// All nonfatal/potentially recoverable errors.
NonFatal(NonFatal),
}
impl From<runtime::Error> for Error {
fn from(o: runtime::Error) -> Self {
match o {
runtime::Error::Fatal(f) => Self::Fatal(Fatal::Runtime(f)),
runtime::Error::NonFatal(f) => Self::NonFatal(NonFatal::Runtime(f)),
}
}
}
/// Fatal errors of this subsystem.
#[derive(Debug, Error)]
pub enum Fatal {
/// Spawning a running task failed.
#[fatal]
#[error("Spawning subsystem task failed: {0}")]
SpawnTask(#[source] SubsystemError),
/// Requester stream exhausted.
#[fatal]
#[error("Erasure chunk requester stream exhausted")]
RequesterExhausted,
#[fatal]
#[error("Receive channel closed: {0}")]
IncomingMessageChannel(#[source] SubsystemError),
/// Errors coming from runtime::Runtime.
#[fatal(forward)]
#[error("Error while accessing runtime information: {0}")]
Runtime(#[from] runtime::Fatal),
Runtime(#[from] runtime::Error),
#[fatal]
#[error("Oneshot for receiving response from Chain API got cancelled")]
ChainApiSenderDropped(#[source] oneshot::Canceled),
#[fatal]
#[error("Retrieving response from Chain API unexpectedly failed with error: {0}")]
ChainApi(#[from] ChainApiError),
}
/// Non-fatal errors of this subsystem.
#[derive(Debug, Error)]
pub enum NonFatal {
/// av-store will drop the sender on any error that happens.
// av-store will drop the sender on any error that happens.
#[error("Response channel to obtain chunk failed")]
QueryChunkResponseChannel(#[source] oneshot::Canceled),
/// av-store will drop the sender on any error that happens.
// av-store will drop the sender on any error that happens.
#[error("Response channel to obtain available data failed")]
QueryAvailableDataResponseChannel(#[source] oneshot::Canceled),
/// We tried accessing a session that was not cached.
// We tried accessing a session that was not cached.
#[error("Session {missing_session} is not cached, cached sessions: {available_sessions:?}.")]
NoSuchCachedSession { available_sessions: Vec<SessionIndex>, missing_session: SessionIndex },
/// Sending request response failed (Can happen on timeouts for example).
// Sending request response failed (Can happen on timeouts for example).
#[error("Sending a request's response failed.")]
SendResponse,
/// Fetching PoV failed with `RequestError`.
#[error("FetchPoV request error: {0}")]
FetchPoV(#[source] RequestError),
/// Fetching PoV failed as the received PoV did not match the expected hash.
#[error("Fetched PoV does not match expected hash")]
UnexpectedPoV,
#[error("Remote responded with `NoSuchPoV`")]
NoSuchPoV,
/// No validator with the index could be found in current session.
#[error("Given validator index could not be found")]
#[error("Given validator index could not be found in current session")]
InvalidValidatorIndex,
/// Errors coming from runtime::Runtime.
#[error("Error while accessing runtime information: {0}")]
Runtime(#[from] runtime::NonFatal),
}
/// General result type for fatal/nonfatal errors.
/// General result abbreviation type alias.
pub type Result<T> = std::result::Result<T, Error>;
/// Results which are never fatal.
pub type NonFatalResult<T> = std::result::Result<T, NonFatal>;
/// Utility for eating top-level errors and logging them.
///
/// We basically always want to try and continue on error. This utility function is meant to
/// consume top-level errors by simply logging them.
pub fn log_error(result: Result<()>, ctx: &'static str) -> std::result::Result<(), Fatal> {
match result {
Err(Error::Fatal(f)) => Err(f),
Err(Error::NonFatal(error)) => {
match error {
NonFatal::UnexpectedPoV |
NonFatal::InvalidValidatorIndex |
NonFatal::NoSuchCachedSession { .. } |
NonFatal::QueryAvailableDataResponseChannel(_) |
NonFatal::QueryChunkResponseChannel(_) =>
tracing::warn!(target: LOG_TARGET, error = %error, ctx),
NonFatal::FetchPoV(_) |
NonFatal::SendResponse |
NonFatal::NoSuchPoV |
NonFatal::Runtime(_) => tracing::debug!(target: LOG_TARGET, error = ?error, ctx),
pub fn log_error(result: Result<()>, ctx: &'static str) -> std::result::Result<(), FatalError> {
match result.into_nested()? {
Ok(()) => Ok(()),
Err(jfyi) => {
match jfyi {
JfyiError::UnexpectedPoV |
JfyiError::InvalidValidatorIndex |
JfyiError::NoSuchCachedSession { .. } |
JfyiError::QueryAvailableDataResponseChannel(_) |
JfyiError::QueryChunkResponseChannel(_) =>
tracing::warn!(target: LOG_TARGET, error = %jfyi, ctx),
JfyiError::FetchPoV(_) |
JfyiError::SendResponse |
JfyiError::NoSuchPoV |
JfyiError::Runtime(_) => tracing::debug!(target: LOG_TARGET, error = ?jfyi, ctx),
}
Ok(())
},
Ok(()) => Ok(()),
}
}
@@ -26,7 +26,7 @@ use polkadot_subsystem::{
/// Error and [`Result`] type for this subsystem.
mod error;
use error::{log_error, Fatal, Result};
use error::{log_error, FatalError, Result};
use polkadot_node_subsystem_util::runtime::RuntimeInfo;
@@ -95,7 +95,7 @@ impl AvailabilityDistributionSubsystem {
}
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), Fatal>
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError>
where
Context: SubsystemContext<Message = AvailabilityDistributionMessage>,
Context: overseer::SubsystemContext<Message = AvailabilityDistributionMessage>,
@@ -111,13 +111,13 @@ impl AvailabilityDistributionSubsystem {
"pov-receiver",
run_pov_receiver(sender.clone(), pov_req_receiver, metrics.clone()).boxed(),
)
.map_err(Fatal::SpawnTask)?;
.map_err(FatalError::SpawnTask)?;
ctx.spawn(
"chunk-receiver",
run_chunk_receiver(sender, chunk_req_receiver, metrics.clone()).boxed(),
)
.map_err(Fatal::SpawnTask)?;
.map_err(FatalError::SpawnTask)?;
}
loop {
@@ -132,9 +132,9 @@ impl AvailabilityDistributionSubsystem {
// Handle task messages sending:
let message = match action {
Either::Left(subsystem_msg) =>
subsystem_msg.map_err(|e| Fatal::IncomingMessageChannel(e))?,
subsystem_msg.map_err(|e| FatalError::IncomingMessageChannel(e))?,
Either::Right(from_task) => {
let from_task = from_task.ok_or(Fatal::RequesterExhausted)?;
let from_task = from_task.ok_or(FatalError::RequesterExhausted)?;
ctx.send_message(from_task).await;
continue
},
@@ -33,7 +33,7 @@ use polkadot_subsystem::{
};
use crate::{
error::{Fatal, NonFatal},
error::{Error, FatalError, JfyiError, Result},
metrics::{FAILED, NOT_FOUND, SUCCEEDED},
Metrics, LOG_TARGET,
};
@@ -48,7 +48,7 @@ pub async fn fetch_pov<Context>(
pov_hash: Hash,
tx: oneshot::Sender<PoV>,
metrics: Metrics,
) -> super::Result<()>
) -> Result<()>
where
Context: SubsystemContext,
{
@@ -56,7 +56,7 @@ where
let authority_id = info
.discovery_keys
.get(from_validator.0 as usize)
.ok_or(NonFatal::InvalidValidatorIndex)?
.ok_or(JfyiError::InvalidValidatorIndex)?
.clone();
let (req, pending_response) = OutgoingRequest::new(
Recipient::Authority(authority_id.clone()),
@@ -77,7 +77,7 @@ where
"pov-fetcher",
fetch_pov_job(pov_hash, authority_id, pending_response.boxed(), span, tx, metrics).boxed(),
)
.map_err(|e| Fatal::SpawnTask(e))?;
.map_err(|e| FatalError::SpawnTask(e))?;
Ok(())
}
@@ -85,7 +85,7 @@ where
async fn fetch_pov_job(
pov_hash: Hash,
authority_id: AuthorityDiscoveryId,
pending_response: BoxFuture<'static, Result<PoVFetchingResponse, RequestError>>,
pending_response: BoxFuture<'static, std::result::Result<PoVFetchingResponse, RequestError>>,
span: jaeger::Span,
tx: oneshot::Sender<PoV>,
metrics: Metrics,
@@ -98,17 +98,17 @@ async fn fetch_pov_job(
/// Do the actual work of waiting for the response.
async fn do_fetch_pov(
pov_hash: Hash,
pending_response: BoxFuture<'static, Result<PoVFetchingResponse, RequestError>>,
pending_response: BoxFuture<'static, std::result::Result<PoVFetchingResponse, RequestError>>,
_span: jaeger::Span,
tx: oneshot::Sender<PoV>,
metrics: Metrics,
) -> std::result::Result<(), NonFatal> {
let response = pending_response.await.map_err(NonFatal::FetchPoV);
) -> Result<()> {
let response = pending_response.await.map_err(Error::FetchPoV);
let pov = match response {
Ok(PoVFetchingResponse::PoV(pov)) => pov,
Ok(PoVFetchingResponse::NoSuchPoV) => {
metrics.on_fetched_pov(NOT_FOUND);
return Err(NonFatal::NoSuchPoV)
return Err(Error::NoSuchPoV)
},
Err(err) => {
metrics.on_fetched_pov(FAILED);
@@ -117,10 +117,10 @@ async fn do_fetch_pov(
};
if pov.hash() == pov_hash {
metrics.on_fetched_pov(SUCCEEDED);
tx.send(pov).map_err(|_| NonFatal::SendResponse)
tx.send(pov).map_err(|_| Error::SendResponse)
} else {
metrics.on_fetched_pov(FAILED);
Err(NonFatal::UnexpectedPoV)
Err(Error::UnexpectedPoV)
}
}
@@ -39,7 +39,7 @@ use polkadot_subsystem::{
};
use crate::{
error::{Fatal, Result},
error::{FatalError, Result},
metrics::{Metrics, FAILED, SUCCEEDED},
requester::session_cache::{BadValidators, SessionInfo},
LOG_TARGET,
@@ -185,7 +185,7 @@ impl FetchTask {
let (handle, kill) = oneshot::channel();
ctx.spawn("chunk-fetcher", running.run(kill).boxed())
.map_err(|e| Fatal::SpawnTask(e))?;
.map_err(|e| FatalError::SpawnTask(e))?;
Ok(FetchTask { live_in, state: FetchedState::Started(handle) })
} else {
@@ -39,8 +39,7 @@ use polkadot_subsystem::{
ActivatedLeaf, ActiveLeavesUpdate, LeafStatus, SubsystemContext,
};
use super::{Metrics, Result, LOG_TARGET};
use crate::error::Fatal;
use super::{FatalError, Metrics, Result, LOG_TARGET};
#[cfg(test)]
mod tests;
@@ -324,6 +323,9 @@ where
})
.await;
let ancestors = rx.await.map_err(Fatal::ChainApiSenderDropped)?.map_err(Fatal::ChainApi)?;
let ancestors = rx
.await
.map_err(FatalError::ChainApiSenderDropped)?
.map_err(FatalError::ChainApi)?;
Ok(ancestors)
}
@@ -26,7 +26,7 @@ use polkadot_primitives::v1::{
use polkadot_subsystem::SubsystemContext;
use crate::{
error::{Error, NonFatal},
error::{Error, Result},
LOG_TARGET,
};
@@ -100,7 +100,7 @@ impl SessionCache {
runtime: &mut RuntimeInfo,
parent: Hash,
with_info: F,
) -> Result<Option<R>, Error>
) -> Result<Option<R>>
where
Context: SubsystemContext,
F: FnOnce(&SessionInfo) -> R,
@@ -143,10 +143,10 @@ impl SessionCache {
///
/// We assume validators in a group are tried in reverse order, so the reported bad validators
/// will be put at the beginning of the group.
pub fn report_bad(&mut self, report: BadValidators) -> crate::Result<()> {
pub fn report_bad(&mut self, report: BadValidators) -> Result<()> {
let available_sessions = self.session_info_cache.iter().map(|(k, _)| *k).collect();
let session = self.session_info_cache.get_mut(&report.session_index).ok_or(
NonFatal::NoSuchCachedSession {
Error::NoSuchCachedSession {
available_sessions,
missing_session: report.session_index,
},
@@ -179,7 +179,7 @@ impl SessionCache {
runtime: &mut RuntimeInfo,
relay_parent: Hash,
session_index: SessionIndex,
) -> Result<Option<SessionInfo>, Error>
) -> Result<Option<SessionInfo>>
where
Context: SubsystemContext,
{
@@ -20,8 +20,9 @@ use std::sync::Arc;
use futures::channel::oneshot;
use fatality::Nested;
use polkadot_node_network_protocol::{
request_response::{incoming, v1, IncomingRequest, IncomingRequestReceiver},
request_response::{v1, IncomingRequest, IncomingRequestReceiver},
UnifiedReputationChange as Rep,
};
use polkadot_node_primitives::{AvailableData, ErasureChunk};
@@ -29,7 +30,7 @@ use polkadot_primitives::v1::{CandidateHash, ValidatorIndex};
use polkadot_subsystem::{jaeger, messages::AvailabilityStoreMessage, SubsystemSender};
use crate::{
error::{NonFatal, NonFatalResult, Result},
error::{JfyiError, Result},
metrics::{Metrics, FAILED, NOT_FOUND, SUCCEEDED},
LOG_TARGET,
};
@@ -45,20 +46,20 @@ pub async fn run_pov_receiver<Sender>(
Sender: SubsystemSender,
{
loop {
match receiver.recv(|| vec![COST_INVALID_REQUEST]).await {
Ok(msg) => {
match receiver.recv(|| vec![COST_INVALID_REQUEST]).await.into_nested() {
Ok(Ok(msg)) => {
answer_pov_request_log(&mut sender, msg, &metrics).await;
},
Err(incoming::Error::Fatal(f)) => {
Err(fatal) => {
tracing::debug!(
target: LOG_TARGET,
error = ?f,
error = ?fatal,
"Shutting down POV receiver."
);
return
},
Err(incoming::Error::NonFatal(error)) => {
tracing::debug!(target: LOG_TARGET, ?error, "Error decoding incoming PoV request.");
Ok(Err(jfyi)) => {
tracing::debug!(target: LOG_TARGET, error = ?jfyi, "Error decoding incoming PoV request.");
},
}
}
@@ -73,22 +74,22 @@ pub async fn run_chunk_receiver<Sender>(
Sender: SubsystemSender,
{
loop {
match receiver.recv(|| vec![COST_INVALID_REQUEST]).await {
Ok(msg) => {
match receiver.recv(|| vec![COST_INVALID_REQUEST]).await.into_nested() {
Ok(Ok(msg)) => {
answer_chunk_request_log(&mut sender, msg, &metrics).await;
},
Err(incoming::Error::Fatal(f)) => {
Err(fatal) => {
tracing::debug!(
target: LOG_TARGET,
error = ?f,
error = ?fatal,
"Shutting down chunk receiver."
);
return
},
Err(incoming::Error::NonFatal(error)) => {
Ok(Err(jfyi)) => {
tracing::debug!(
target: LOG_TARGET,
?error,
error = ?jfyi,
"Error decoding incoming chunk request."
);
},
@@ -169,7 +170,7 @@ where
},
};
req.send_response(response).map_err(|_| NonFatal::SendResponse)?;
req.send_response(response).map_err(|_| JfyiError::SendResponse)?;
Ok(result)
}
@@ -205,7 +206,7 @@ where
Some(chunk) => v1::ChunkFetchingResponse::Chunk(chunk.into()),
};
req.send_response(response).map_err(|_| NonFatal::SendResponse)?;
req.send_response(response).map_err(|_| JfyiError::SendResponse)?;
Ok(result)
}
@@ -214,7 +215,7 @@ async fn query_chunk<Sender>(
sender: &mut Sender,
candidate_hash: CandidateHash,
validator_index: ValidatorIndex,
) -> NonFatalResult<Option<ErasureChunk>>
) -> std::result::Result<Option<ErasureChunk>, JfyiError>
where
Sender: SubsystemSender,
{
@@ -233,7 +234,7 @@ where
error = ?e,
"Error retrieving chunk",
);
NonFatal::QueryChunkResponseChannel(e)
JfyiError::QueryChunkResponseChannel(e)
})?;
Ok(result)
}
@@ -242,7 +243,7 @@ where
async fn query_available_data<Sender>(
sender: &mut Sender,
candidate_hash: CandidateHash,
) -> NonFatalResult<Option<AvailableData>>
) -> Result<Option<AvailableData>>
where
Sender: SubsystemSender,
{
@@ -251,6 +252,6 @@ where
.send_message(AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx).into())
.await;
let result = rx.await.map_err(|e| NonFatal::QueryAvailableDataResponseChannel(e))?;
let result = rx.await.map_err(JfyiError::QueryAvailableDataResponseChannel)?;
Ok(result)
}