Limit number of blocks per level (2nd attempt) (#1559)

Prevents the StateDbError::TooManySiblingBlocks error by eagerly removing stale blocks
from the backend on block import, before the error condition is met.

Introduces a just-in-time recovery mechanism, driven by explicit pov-recovery requests,
for blocks that were wrongly removed.

Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>
This commit is contained in:
Davide Galassi
2022-12-20 12:13:49 +01:00
committed by GitHub
parent 79d8c5c3b8
commit 030ba80ba0
17 changed files with 1096 additions and 184 deletions
@@ -15,7 +15,6 @@
// along with Cumulus. If not, see <http://www.gnu.org/licenses/>.
use async_trait::async_trait;
use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
use sc_client_api::{
Backend, BlockBackend, BlockImportNotification, BlockchainEvents, Finalizer, UsageProvider,
};
@@ -27,15 +26,25 @@ use sp_runtime::{
traits::{Block as BlockT, Header as HeaderT},
};
use cumulus_client_pov_recovery::{RecoveryDelay, RecoveryKind, RecoveryRequest};
use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
use polkadot_primitives::v2::{Hash as PHash, Id as ParaId, OccupiedCoreAssumption};
use codec::Decode;
use futures::{select, FutureExt, Stream, StreamExt};
use futures::{channel::mpsc::Sender, select, FutureExt, Stream, StreamExt};
use std::{pin::Pin, sync::Arc};
use std::{pin::Pin, sync::Arc, time::Duration};
// Target passed to the `tracing` macros for the log messages emitted from this file.
const LOG_TARGET: &str = "cumulus-consensus";
// Delay range attached to the explicit block recovery requests sent from this file.
//
// The chosen values don't have any special meaning: a random delay on the order of
// seconds should, in practice, be good enough to allow a quick recovery without DoSing
// the relay chain.
const RECOVERY_DELAY: RecoveryDelay =
RecoveryDelay { min: Duration::ZERO, max: Duration::from_secs(30) };
/// Helper for the relay chain client. This is expected to be a lightweight handle like an `Arc`.
#[async_trait]
pub trait RelaychainClient: Clone + 'static {
@@ -82,7 +91,7 @@ where
let finalized_head = if let Some(h) = finalized_heads.next().await {
h
} else {
tracing::debug!(target: "cumulus-consensus", "Stopping following finalized head.");
tracing::debug!(target: LOG_TARGET, "Stopping following finalized head.");
return
};
@@ -90,7 +99,7 @@ where
Ok(header) => header,
Err(err) => {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
error = ?err,
"Could not decode parachain header while following finalized heads.",
);
@@ -105,12 +114,12 @@ where
if let Err(e) = parachain.finalize_block(hash, None, true) {
match e {
ClientError::UnknownBlock(_) => tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
block_hash = ?hash,
"Could not finalize block because it is unknown.",
),
_ => tracing::warn!(
target: "cumulus-consensus",
target: LOG_TARGET,
error = ?e,
block_hash = ?hash,
"Failed to finalize block",
@@ -136,6 +145,7 @@ pub async fn run_parachain_consensus<P, R, Block, B>(
parachain: Arc<P>,
relay_chain: R,
announce_block: Arc<dyn Fn(Block::Hash, Option<Vec<u8>>) + Send + Sync>,
recovery_chan_tx: Option<Sender<RecoveryRequest<Block>>>,
) where
Block: BlockT,
P: Finalizer<Block, B>
@@ -148,8 +158,13 @@ pub async fn run_parachain_consensus<P, R, Block, B>(
R: RelaychainClient,
B: Backend<Block>,
{
let follow_new_best =
follow_new_best(para_id, parachain.clone(), relay_chain.clone(), announce_block);
let follow_new_best = follow_new_best(
para_id,
parachain.clone(),
relay_chain.clone(),
announce_block,
recovery_chan_tx,
);
let follow_finalized_head = follow_finalized_head(para_id, parachain, relay_chain);
select! {
_ = follow_new_best.fuse() => {},
@@ -163,6 +178,7 @@ async fn follow_new_best<P, R, Block, B>(
parachain: Arc<P>,
relay_chain: R,
announce_block: Arc<dyn Fn(Block::Hash, Option<Vec<u8>>) + Send + Sync>,
recovery_chan_tx: Option<Sender<RecoveryRequest<Block>>>,
) where
Block: BlockT,
P: Finalizer<Block, B>
@@ -197,10 +213,11 @@ async fn follow_new_best<P, R, Block, B>(
h,
&*parachain,
&mut unset_best_header,
recovery_chan_tx.clone(),
).await,
None => {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
"Stopping following new best.",
);
return
@@ -217,7 +234,7 @@ async fn follow_new_best<P, R, Block, B>(
).await,
None => {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
"Stopping following imported blocks.",
);
return
@@ -276,7 +293,7 @@ async fn handle_new_block_imported<Block, P>(
import_block_as_new_best(unset_hash, unset_best_header, parachain).await;
},
state => tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
?unset_best_header,
?notification.header,
?state,
@@ -290,6 +307,7 @@ async fn handle_new_best_parachain_head<Block, P>(
head: Vec<u8>,
parachain: &P,
unset_best_header: &mut Option<Block::Header>,
mut recovery_chan_tx: Option<Sender<RecoveryRequest<Block>>>,
) where
Block: BlockT,
P: UsageProvider<Block> + Send + Sync + BlockBackend<Block>,
@@ -299,7 +317,7 @@ async fn handle_new_best_parachain_head<Block, P>(
Ok(header) => header,
Err(err) => {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
error = ?err,
"Could not decode Parachain header while following best heads.",
);
@@ -311,7 +329,7 @@ async fn handle_new_best_parachain_head<Block, P>(
if parachain.usage_info().chain.best_hash == hash {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
block_hash = ?hash,
"Skipping set new best block, because block is already the best.",
)
@@ -325,7 +343,7 @@ async fn handle_new_best_parachain_head<Block, P>(
},
Ok(BlockStatus::InChainPruned) => {
tracing::error!(
target: "cumulus-collator",
target: LOG_TARGET,
block_hash = ?hash,
"Trying to set pruned block as new best!",
);
@@ -334,14 +352,30 @@ async fn handle_new_best_parachain_head<Block, P>(
*unset_best_header = Some(parachain_head);
tracing::debug!(
target: "cumulus-collator",
target: LOG_TARGET,
block_hash = ?hash,
"Parachain block not yet imported, waiting for import to enact as best block.",
);
if let Some(ref mut recovery_chan_tx) = recovery_chan_tx {
// Best effort channel to actively encourage block recovery.
// An error here is not fatal; the relay chain continuously re-announces
// the best block, thus we will have other opportunities to retry.
let req =
RecoveryRequest { hash, delay: RECOVERY_DELAY, kind: RecoveryKind::Full };
if let Err(err) = recovery_chan_tx.try_send(req) {
tracing::warn!(
target: LOG_TARGET,
block_hash = ?hash,
error = ?err,
"Unable to notify block recovery subsystem"
)
}
}
},
Err(e) => {
tracing::error!(
target: "cumulus-collator",
target: LOG_TARGET,
block_hash = ?hash,
error = ?e,
"Failed to get block status of block.",
@@ -361,7 +395,7 @@ where
let best_number = parachain.usage_info().chain.best_number;
if *header.number() < best_number {
tracing::debug!(
target: "cumulus-consensus",
target: LOG_TARGET,
%best_number,
block_number = %header.number(),
"Skipping importing block as new best block, because there already exists a \
@@ -377,7 +411,7 @@ where
if let Err(err) = (&*parachain).import_block(block_import_params, Default::default()).await {
tracing::warn!(
target: "cumulus-consensus",
target: LOG_TARGET,
block_hash = ?hash,
error = ?err,
"Failed to set new best block.",