fatality based errors (#4448)

* seed commit for fatality based errors

* fatality

* first draft of fatality

* cleanup

* differnt approach

* simplify

* first working version for enums, with documentation

* add split

* fix simple split test case

* extend README.md

* update fatality impl

* make tests passed

* apply fatality to first subsystem

* fatality fixes

* use fatality in a subsystem

* fix subsystemg

* fixup proc macro

* fix/test: log::*! do not execute when log handler is missing

* fix spelling

* rename Runtime2 to something sane

* allow nested split with `forward` annotations

* add free license

* enable and fixup all tests

* use external fatality

Makes this more reviewable.

* bump fatality dep

Avoid duplicate expander compilations.

* migrate availability distribution

* more fatality usage

* chore: bump fatality to 0.0.6

* fixup remaining subsystems

* chore: fmt

* make cargo spellcheck happy

* remove single instance of `#[fatal(false)]`

* last quality sweep

* fixup
This commit is contained in:
Bernhard Schuster
2022-02-25 18:25:26 +01:00
committed by GitHub
parent 85fa087405
commit d946582707
48 changed files with 425 additions and 659 deletions
@@ -11,6 +11,7 @@ parity-scale-codec = "3.0.0"
kvdb = "0.11.0"
thiserror = "1.0.30"
lru = "0.7.2"
fatality = "0.0.6"
polkadot-primitives = { path = "../../../primitives" }
polkadot-node-primitives = { path = "../../primitives" }
@@ -19,6 +20,7 @@ polkadot-node-subsystem-util = { path = "../../subsystem-util" }
sc-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" }
[dev-dependencies]
kvdb-memorydb = "0.11.0"
polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" }
@@ -24,7 +24,8 @@ use polkadot_primitives::v1::BlockNumber;
use futures::prelude::*;
use crate::error::{Error, Result};
use crate::error::Result;
use fatality::Nested;
const LOG_TARGET: &str = "parachain::dispute-coordinator";
@@ -62,13 +63,16 @@ where
{
loop {
let res = run_until_error(&mut ctx, &subsystem).await;
match res {
Err(e) =>
if let Error::Fatal(_) = e {
break
},
Ok(()) => {
tracing::info!(target: LOG_TARGET, "received `Conclude` signal, exiting");
match res.into_nested() {
Err(fatal) => {
tracing::error!(target: LOG_TARGET, "Observed fatal issue: {:?}", fatal);
break
},
Ok(Err(jfyi)) => {
tracing::debug!(target: LOG_TARGET, "Observed issue: {:?}", jfyi);
},
Ok(Ok(())) => {
tracing::info!(target: LOG_TARGET, "Received `Conclude` signal, exiting");
break
},
}
@@ -14,90 +14,64 @@
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use fatality::Nested;
use futures::channel::oneshot;
use thiserror::Error;
use polkadot_node_subsystem::{
errors::{ChainApiError, RuntimeApiError},
SubsystemError,
};
use polkadot_node_subsystem::{errors::ChainApiError, SubsystemError};
use polkadot_node_subsystem_util::{rolling_session_window::SessionsUnavailable, runtime};
use crate::LOG_TARGET;
use crate::{real::participation, LOG_TARGET};
use parity_scale_codec::Error as CodecError;
/// Errors for this subsystem.
#[derive(Debug, Error)]
#[error(transparent)]
pub type Result<T> = std::result::Result<T, Error>;
pub type FatalResult<T> = std::result::Result<T, FatalError>;
pub type JfyiResult<T> = std::result::Result<T, JfyiError>;
#[allow(missing_docs)]
#[fatality::fatality(splitable)]
pub enum Error {
/// All fatal errors.
Fatal(#[from] Fatal),
/// All nonfatal/potentially recoverable errors.
NonFatal(#[from] NonFatal),
}
/// General `Result` type for dispute coordinator.
pub type Result<R> = std::result::Result<R, Error>;
/// Result type with only fatal errors.
pub type FatalResult<R> = std::result::Result<R, Fatal>;
/// Result type with only non fatal errors.
pub type NonFatalResult<R> = std::result::Result<R, NonFatal>;
impl From<runtime::Error> for Error {
fn from(o: runtime::Error) -> Self {
match o {
runtime::Error::Fatal(f) => Self::Fatal(Fatal::Runtime(f)),
runtime::Error::NonFatal(f) => Self::NonFatal(NonFatal::Runtime(f)),
}
}
}
impl From<SubsystemError> for Error {
fn from(o: SubsystemError) -> Self {
match o {
SubsystemError::Context(msg) => Self::Fatal(Fatal::SubsystemContext(msg)),
_ => Self::NonFatal(NonFatal::Subsystem(o)),
}
}
}
/// Fatal errors of this subsystem.
#[derive(Debug, Error)]
pub enum Fatal {
/// Errors coming from runtime::Runtime.
#[error("Error while accessing runtime information {0}")]
Runtime(#[from] runtime::Fatal),
/// We received a legacy `SubystemError::Context` error which is considered fatal.
#[fatal]
#[error("SubsystemError::Context error: {0}")]
SubsystemContext(String),
/// `ctx.spawn` failed with an error.
#[fatal]
#[error("Spawning a task failed: {0}")]
SpawnFailed(SubsystemError),
SpawnFailed(#[source] SubsystemError),
#[fatal]
#[error("Participation worker receiver exhausted.")]
ParticipationWorkerReceiverExhausted,
/// Receiving subsystem message from overseer failed.
#[fatal]
#[error("Receiving message from overseer failed: {0}")]
SubsystemReceive(#[source] SubsystemError),
#[fatal]
#[error("Writing to database failed: {0}")]
DbWriteFailed(std::io::Error),
#[error("Oneshot for receiving response from chain API got cancelled")]
#[fatal]
#[error("Oneshot for receiving block number from chain API got cancelled")]
CanceledBlockNumber,
#[fatal]
#[error("Retrieving block number from chain API failed with error: {0}")]
ChainApiBlockNumber(ChainApiError),
#[fatal]
#[error(transparent)]
ChainApiAncestors(ChainApiError),
#[fatal]
#[error("Chain API dropped response channel sender")]
ChainApiSenderDropped,
#[error("Retrieving response from chain API unexpectedly failed with error: {0}")]
ChainApi(#[from] ChainApiError),
}
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum NonFatal {
#[error(transparent)]
RuntimeApi(#[from] RuntimeApiError),
#[fatal(forward)]
#[error("Error while accessing runtime information {0}")]
Runtime(#[from] runtime::Error),
#[error(transparent)]
ChainApi(#[from] ChainApiError),
@@ -112,7 +86,7 @@ pub enum NonFatal {
DisputeImportOneshotSend,
#[error(transparent)]
Subsystem(SubsystemError),
Subsystem(#[from] SubsystemError),
#[error(transparent)]
Codec(#[from] CodecError),
@@ -121,36 +95,32 @@ pub enum NonFatal {
#[error("Sessions unavailable in `RollingSessionWindow`: {0}")]
RollingSessionWindow(#[from] SessionsUnavailable),
/// Errors coming from runtime::Runtime.
#[error("Error while accessing runtime information: {0}")]
Runtime(#[from] runtime::NonFatal),
#[error(transparent)]
QueueError(#[from] crate::real::participation::QueueError),
QueueError(#[from] participation::QueueError),
}
/// Utility for eating top level errors and log them.
///
/// We basically always want to try and continue on error. This utility function is meant to
/// consume top-level errors by simply logging them
pub fn log_error(result: Result<()>) -> std::result::Result<(), Fatal> {
match result {
Err(Error::Fatal(f)) => Err(f),
Err(Error::NonFatal(error)) => {
error.log();
pub fn log_error(result: Result<()>) -> std::result::Result<(), FatalError> {
match result.into_nested()? {
Ok(()) => Ok(()),
Err(jfyi) => {
jfyi.log();
Ok(())
},
Ok(()) => Ok(()),
}
}
impl NonFatal {
/// Log a `NonFatal`.
impl JfyiError {
/// Log a `JfyiError`.
pub fn log(self) {
match self {
// don't spam the log with spurious errors
Self::RuntimeApi(_) | Self::Oneshot(_) =>
tracing::debug!(target: LOG_TARGET, error = ?self),
Self::Runtime(_) | Self::Oneshot(_) => {
tracing::debug!(target: LOG_TARGET, error = ?self)
},
// it's worth reporting otherwise
_ => tracing::warn!(target: LOG_TARGET, error = ?self),
}
@@ -28,15 +28,14 @@ use kvdb::{DBTransaction, KeyValueDB};
use parity_scale_codec::{Decode, Encode};
use crate::{
error::{Fatal, FatalResult},
error::{FatalError, FatalResult},
real::{
backend::{Backend, BackendWriteOp, OverlayedBackend},
DISPUTE_WINDOW,
},
status::DisputeStatus,
};
use crate::real::{
backend::{Backend, BackendWriteOp, OverlayedBackend},
DISPUTE_WINDOW,
};
const RECENT_DISPUTES_KEY: &[u8; 15] = b"recent-disputes";
const EARLIEST_SESSION_KEY: &[u8; 16] = b"earliest-session";
const CANDIDATE_VOTES_SUBKEY: &[u8; 15] = b"candidate-votes";
@@ -100,7 +99,7 @@ impl Backend for DbBackend {
}
}
self.inner.write(tx).map_err(Fatal::DbWriteFailed)
self.inner.write(tx).map_err(FatalError::DbWriteFailed)
}
}
@@ -168,8 +167,8 @@ pub enum Error {
impl From<Error> for crate::error::Error {
fn from(err: Error) -> Self {
match err {
Error::Io(io) => Self::NonFatal(crate::error::NonFatal::Io(io)),
Error::Codec(e) => Self::NonFatal(crate::error::NonFatal::Codec(e)),
Error::Io(io) => Self::Io(io),
Error::Codec(e) => Self::Codec(e),
}
}
}
@@ -53,7 +53,7 @@ use polkadot_primitives::{
};
use crate::{
error::{log_error, Error, Fatal, FatalResult, NonFatal, NonFatalResult, Result},
error::{log_error, Error, FatalError, FatalResult, JfyiError, JfyiResult, Result},
metrics::Metrics,
real::{ordering::get_finalized_block_number, DisputeCoordinatorSubsystem},
status::{get_active_with_status, Clock, DisputeStatus, Timestamp},
@@ -610,7 +610,7 @@ impl Initialized {
overlay_db: &mut OverlayedBackend<'_, impl Backend>,
message: DisputeCoordinatorMessage,
now: Timestamp,
) -> Result<Box<dyn FnOnce() -> NonFatalResult<()>>> {
) -> Result<Box<dyn FnOnce() -> JfyiResult<()>>> {
match message {
DisputeCoordinatorMessage::ImportStatements {
candidate_hash,
@@ -633,7 +633,7 @@ impl Initialized {
let report = move || {
pending_confirmation
.send(outcome)
.map_err(|_| NonFatal::DisputeImportOneshotSend)
.map_err(|_| JfyiError::DisputeImportOneshotSend)
};
match outcome {
ImportStatementsResult::InvalidImport => {
@@ -733,7 +733,7 @@ impl Initialized {
// Helper function for checking subsystem errors in message processing.
fn ensure_available_session_info(&self) -> Result<()> {
if let Some(subsystem_error) = self.error.clone() {
return Err(Error::NonFatal(NonFatal::RollingSessionWindow(subsystem_error)))
return Err(Error::RollingSessionWindow(subsystem_error))
}
Ok(())
@@ -1174,8 +1174,8 @@ impl MuxedMessage {
let from_overseer = ctx.recv().fuse();
futures::pin_mut!(from_overseer, from_sender);
futures::select!(
msg = from_overseer => Ok(Self::Subsystem(msg.map_err(Fatal::SubsystemReceive)?)),
msg = from_sender.next() => Ok(Self::Participation(msg.ok_or(Fatal::ParticipationWorkerReceiverExhausted)?)),
msg = from_overseer => Ok(Self::Subsystem(msg.map_err(FatalError::SubsystemReceive)?)),
msg = from_sender.next() => Ok(Self::Participation(msg.ok_or(FatalError::ParticipationWorkerReceiverExhausted)?)),
)
}
}
@@ -40,13 +40,13 @@ use polkadot_node_subsystem_util::rolling_session_window::RollingSessionWindow;
use polkadot_primitives::v1::{ValidatorIndex, ValidatorPair};
use crate::{
error::{Error, FatalResult, NonFatal, Result},
error::{FatalResult, JfyiError, Result},
metrics::Metrics,
status::{get_active_with_status, SystemClock},
};
use backend::{Backend, OverlayedBackend};
use db::v1::DbBackend;
use fatality::Split;
use self::{
ordering::CandidateComparator,
@@ -196,9 +196,8 @@ impl DisputeCoordinatorSubsystem {
tracing::info!(target: LOG_TARGET, "received `Conclude` signal, exiting");
return Ok(None)
},
Err(Error::Fatal(f)) => return Err(f),
Err(Error::NonFatal(e)) => {
e.log();
Err(e) => {
e.split()?.log();
continue
},
};
@@ -219,9 +218,8 @@ impl DisputeCoordinatorSubsystem {
.await
{
Ok(v) => v,
Err(Error::Fatal(f)) => return Err(f),
Err(Error::NonFatal(e)) => {
e.log();
Err(e) => {
e.split()?.log();
continue
},
};
@@ -371,7 +369,7 @@ where
leaf.clone(),
RollingSessionWindow::new(ctx, DISPUTE_WINDOW, leaf.hash)
.await
.map_err(NonFatal::RollingSessionWindow)?,
.map_err(JfyiError::RollingSessionWindow)?,
)))
} else {
Ok(None)
@@ -401,11 +399,13 @@ where
// available). So instead of telling subsystems, everything is fine, because of an
// hour old database state, we should rather cancel contained oneshots and delay
// finality until we are fully functional.
{
tracing::warn!(
target: LOG_TARGET,
?msg,
"Received msg before first active leaves update. This is not expected - message will be dropped."
),
)
},
}
}
}
@@ -29,7 +29,7 @@ use polkadot_node_subsystem_util::runtime::get_candidate_events;
use polkadot_primitives::v1::{BlockNumber, CandidateEvent, CandidateHash, CandidateReceipt, Hash};
use crate::{
error::{Fatal, FatalResult, Result},
error::{FatalError, FatalResult, Result},
LOG_TARGET,
};
@@ -182,7 +182,7 @@ impl OrderingProvider {
&mut self,
sender: &mut Sender,
update: &ActiveLeavesUpdate,
) -> Result<()> {
) -> crate::error::Result<()> {
if let Some(activated) = update.activated.as_ref() {
// Fetch last finalized block.
let ancestors = match get_finalized_block_number(sender).await {
@@ -299,7 +299,9 @@ impl OrderingProvider {
)
.await;
rx.await.or(Err(Fatal::ChainApiSenderDropped))?.map_err(Fatal::ChainApi)?
rx.await
.or(Err(FatalError::ChainApiSenderDropped))?
.map_err(FatalError::ChainApiAncestors)?
};
let earliest_block_number = match head_number.checked_sub(hashes.len() as u32) {
@@ -356,8 +358,8 @@ where
receiver
.await
.map_err(|_| Fatal::ChainApiSenderDropped)?
.map_err(Fatal::ChainApi)
.map_err(|_| FatalError::ChainApiSenderDropped)?
.map_err(FatalError::ChainApiAncestors)
}
async fn get_block_number(
@@ -29,12 +29,10 @@ use polkadot_node_subsystem::{
use polkadot_node_subsystem_util::runtime::get_validation_code_by_hash;
use polkadot_primitives::v1::{BlockNumber, CandidateHash, CandidateReceipt, Hash, SessionIndex};
use crate::{
error::{Fatal, FatalResult, NonFatal, Result},
LOG_TARGET,
};
use crate::real::LOG_TARGET;
use super::ordering::CandidateComparator;
use crate::error::{FatalError, FatalResult, JfyiError, Result};
#[cfg(test)]
mod tests;
@@ -43,7 +41,7 @@ pub use tests::{participation_full_happy_path, participation_missing_availabilit
mod queues;
use queues::Queues;
pub use queues::{Error as QueueError, ParticipationRequest};
pub use queues::{ParticipationRequest, QueueError};
/// How many participation processes do we want to run in parallel the most.
///
@@ -161,7 +159,7 @@ impl Participation {
}
}
// Out of capacity/no recent block yet - queue:
Ok(self.queue.queue(comparator, req).map_err(NonFatal::QueueError)?)
Ok(self.queue.queue(comparator, req).map_err(JfyiError::QueueError)?)
}
/// Message from a worker task was received - get the outcome.
@@ -239,7 +237,7 @@ impl Participation {
"participation-worker",
participate(self.worker_sender.clone(), sender, recent_head, req).boxed(),
)
.map_err(Fatal::SpawnFailed)?;
.map_err(FatalError::SpawnFailed)?;
}
Ok(())
}
@@ -16,8 +16,6 @@
use std::collections::{BTreeMap, HashMap};
use thiserror::Error;
use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, SessionIndex};
use crate::real::ordering::CandidateComparator;
@@ -83,8 +81,8 @@ struct BestEffortEntry {
}
/// What can go wrong when queuing a request.
#[derive(Debug, Error)]
pub enum Error {
#[derive(Debug, thiserror::Error)]
pub enum QueueError {
#[error("Request could not be queued, because best effort queue was already full.")]
BestEffortFull,
#[error("Request could not be queued, because priority queue was already full.")]
@@ -137,21 +135,21 @@ impl Queues {
&mut self,
comparator: Option<CandidateComparator>,
req: ParticipationRequest,
) -> Result<(), Error> {
) -> Result<(), QueueError> {
debug_assert!(comparator
.map(|c| c.matches_candidate(req.candidate_hash()))
.unwrap_or(true));
if let Some(comparator) = comparator {
if self.priority.len() >= PRIORITY_QUEUE_SIZE {
return Err(Error::PriorityFull)
return Err(QueueError::PriorityFull)
}
// Remove any best effort entry:
self.best_effort.remove(&req.candidate_hash);
self.priority.insert(comparator, req);
} else {
if self.best_effort.len() >= BEST_EFFORT_QUEUE_SIZE {
return Err(Error::BestEffortFull)
return Err(QueueError::BestEffortFull)
}
// Note: The request might have been added to priority in a previous call already, we
// take care of that case in `dequeue` (more efficient).
@@ -20,7 +20,7 @@ use polkadot_primitives::v1::{BlockNumber, Hash};
use crate::real::ordering::CandidateComparator;
use super::{Error, ParticipationRequest, Queues};
use super::{ParticipationRequest, QueueError, Queues};
/// Make a `ParticipationRequest` based on the given commitments hash.
fn make_participation_request(hash: Hash) -> ParticipationRequest {
@@ -64,9 +64,9 @@ fn ordering_works_as_expected() {
queue.queue(None, req5.clone()).unwrap();
assert_matches!(
queue.queue(Some(make_dummy_comparator(&req_prio_full, 3)), req_prio_full),
Err(Error::PriorityFull)
Err(QueueError::PriorityFull)
);
assert_matches!(queue.queue(None, req_full), Err(Error::BestEffortFull));
assert_matches!(queue.queue(None, req_full), Err(QueueError::BestEffortFull));
assert_eq!(queue.dequeue(), Some(req_prio));
assert_eq!(queue.dequeue(), Some(req_prio_2));
@@ -64,14 +64,14 @@ impl SpamSlots {
let mut slots: HashMap<(SessionIndex, ValidatorIndex), SpamCount> = HashMap::new();
for ((session, _), validators) in unconfirmed_disputes.iter() {
for validator in validators {
let e = slots.entry((*session, *validator)).or_default();
*e += 1;
if *e > MAX_SPAM_VOTES {
let spam_vote_count = slots.entry((*session, *validator)).or_default();
*spam_vote_count += 1;
if *spam_vote_count > MAX_SPAM_VOTES {
tracing::debug!(
target: LOG_TARGET,
?session,
?validator,
count = ?e,
count = ?spam_vote_count,
"Import exceeded spam slot for validator"
);
}
@@ -93,8 +93,8 @@ impl SpamSlots {
candidate: CandidateHash,
validator: ValidatorIndex,
) -> bool {
let c = self.slots.entry((session, validator)).or_default();
if *c >= MAX_SPAM_VOTES {
let spam_vote_count = self.slots.entry((session, validator)).or_default();
if *spam_vote_count >= MAX_SPAM_VOTES {
return false
}
let validators = self.unconfirmed.entry((session, candidate)).or_default();
@@ -103,7 +103,7 @@ impl SpamSlots {
// We only increment spam slots once per candidate, as each validator has to provide an
// opposing vote for sending out its own vote. Therefore, receiving multiple votes for
// a single candidate is expected and should not get punished here.
*c += 1;
*spam_vote_count += 1;
}
true
@@ -118,8 +118,8 @@ impl SpamSlots {
if let Some(validators) = self.unconfirmed.remove(key) {
let (session, _) = key;
for validator in validators {
if let Some(c) = self.slots.remove(&(*session, validator)) {
let new = c - 1;
if let Some(spam_vote_count) = self.slots.remove(&(*session, validator)) {
let new = spam_vote_count - 1;
if new > 0 {
self.slots.insert((*session, validator), new);
}
@@ -117,8 +117,8 @@ impl sp_inherents::InherentDataProvider for ParachainsInherentDataProvider {
async fn try_handle_error(
&self,
_: &sp_inherents::InherentIdentifier,
_: &[u8],
_identifier: &sp_inherents::InherentIdentifier,
_error: &[u8],
) -> Option<Result<(), sp_inherents::Error>> {
// Inherent isn't checked and can not return any error
None