Limit number of blocks per level (2nd attempt) (#1559)

Prevents the StateDbError::TooManySiblingBlocks error from being triggered by eagerly removing stale blocks from the backend on block import and before the error condition is met. Introduces a just in time block recovery mechanism for blocks that were wrongly removed via an explicit pov-recovery method Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>
2026-06-14 15:41:02 +00:00 · 2022-12-20 12:13:49 +01:00
parent 79d8c5c3b8
commit 030ba80ba0
17 changed files with 1096 additions and 184 deletions
@@ -20,11 +20,13 @@

 use cumulus_client_cli::CollatorOptions;
 use cumulus_client_consensus_common::ParachainConsensus;
+use cumulus_client_pov_recovery::{PoVRecovery, RecoveryDelay};
 use cumulus_primitives_core::{CollectCollationInfo, ParaId};
 use cumulus_relay_chain_inprocess_interface::build_inprocess_relay_chain;
 use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
 use cumulus_relay_chain_minimal_node::build_minimal_relay_chain_node;
 use polkadot_primitives::v2::CollatorPair;
+
 use sc_client_api::{
 	Backend as BackendT, BlockBackend, BlockchainEvents, Finalizer, UsageProvider,
 };
@@ -35,8 +37,15 @@ use sp_api::ProvideRuntimeApi;
 use sp_blockchain::HeaderBackend;
 use sp_core::traits::SpawnNamed;
 use sp_runtime::traits::Block as BlockT;
+
+use futures::channel::mpsc;
 use std::{sync::Arc, time::Duration};

+// Given the sporadic nature of the explicit recovery operation and the
+// possibility to retry infinite times this value is more than enough.
+// In practice here we expect no more than one queued messages.
+const RECOVERY_CHAN_SIZE: usize = 8;
+
 /// Parameters given to [`start_collator`].
 pub struct StartCollatorParams<'a, Block: BlockT, BS, Client, RCInterface, Spawner> {
 	pub block_status: Arc<BS>,
@@ -90,11 +99,14 @@ where
 	RCInterface: RelayChainInterface + Clone + 'static,
 	Backend: BackendT<Block> + 'static,
 {
+	let (recovery_chan_tx, recovery_chan_rx) = mpsc::channel(RECOVERY_CHAN_SIZE);
+
 	let consensus = cumulus_client_consensus_common::run_parachain_consensus(
 		para_id,
 		client.clone(),
 		relay_chain_interface.clone(),
 		announce_block.clone(),
+		Some(recovery_chan_tx),
 	);

 	task_manager
@@ -105,15 +117,16 @@ where
 		.overseer_handle()
 		.map_err(|e| sc_service::Error::Application(Box::new(e)))?;

-	let pov_recovery = cumulus_client_pov_recovery::PoVRecovery::new(
+	let pov_recovery = PoVRecovery::new(
 		overseer_handle.clone(),
 		// We want that collators wait at maximum the relay chain slot duration before starting
 		// to recover blocks.
-		cumulus_client_pov_recovery::RecoveryDelay::WithMax { max: relay_chain_slot_duration },
+		RecoveryDelay { min: core::time::Duration::ZERO, max: relay_chain_slot_duration },
 		client.clone(),
 		import_queue,
 		relay_chain_interface.clone(),
 		para_id,
+		recovery_chan_rx,
 	);

 	task_manager
@@ -173,11 +186,14 @@ where
 	Backend: BackendT<Block> + 'static,
 	RCInterface: RelayChainInterface + Clone + 'static,
 {
+	let (recovery_chan_tx, recovery_chan_rx) = mpsc::channel(RECOVERY_CHAN_SIZE);
+
 	let consensus = cumulus_client_consensus_common::run_parachain_consensus(
 		para_id,
 		client.clone(),
 		relay_chain_interface.clone(),
 		announce_block,
+		Some(recovery_chan_tx),
 	);

 	task_manager
@@ -188,21 +204,19 @@ where
 		.overseer_handle()
 		.map_err(|e| sc_service::Error::Application(Box::new(e)))?;

-	let pov_recovery = cumulus_client_pov_recovery::PoVRecovery::new(
+	let pov_recovery = PoVRecovery::new(
 		overseer_handle,
 		// Full nodes should at least wait 2.5 minutes (assuming 6 seconds slot duration) and
 		// in maximum 5 minutes before starting to recover blocks. Collators should already start
 		// the recovery way before full nodes try to recover a certain block and then share the
 		// block with the network using "the normal way". Full nodes are just the "last resort"
 		// for block recovery.
-		cumulus_client_pov_recovery::RecoveryDelay::WithMinAndMax {
-			min: relay_chain_slot_duration * 25,
-			max: relay_chain_slot_duration * 50,
-		},
+		RecoveryDelay { min: relay_chain_slot_duration * 25, max: relay_chain_slot_duration * 50 },
 		client,
 		import_queue,
 		relay_chain_interface,
 		para_id,
+		recovery_chan_rx,
 	);

 	task_manager