Add retry mechanism for pov-recovery, fix full-node pov-recovery (#2164)

* Increase delay for pov-recovery * Update client/service/src/lib.rs Co-authored-by: Bastian Köcher <git@kchr.de> * Comment * FMT * Clear waiting_recovery when block is recovered or recovery failed * Introduce recovery queue that preserved insertion order * Better error logs * Decrease slot duration * Style improvements * Add option to use unordered queue * Maintain cache of finalized blocks * Wait for one relay chain slot before recovery * Make retries testable * fmt * Improve docs * Improve docs * Simplify RecoveryQueue * Remove unwanted changes * Adjust to comments * Apply suggestions from code review Co-authored-by: Bastian Köcher <git@kchr.de> * Move recovery delay into the queue * Check for finalized number * Clean up * Use timer Co-authored-by: Bastian Köcher <git@kchr.de> * Simplify implementation * Revert "Use timer" This reverts commit 3809eed840d3a09d54212f99486782ff80cdc1c9. * Properly clear `to_recover` flag --------- Co-authored-by: Bastian Köcher <git@kchr.de>
2026-06-18 15:21:05 +00:00 · 2023-02-09 14:18:55 +01:00
parent b4d0992ca8
commit afcfd2404a
15 changed files with 419 additions and 161 deletions
@@ -20,7 +20,7 @@

 use cumulus_client_cli::CollatorOptions;
 use cumulus_client_consensus_common::ParachainConsensus;
-use cumulus_client_pov_recovery::{PoVRecovery, RecoveryDelay};
+use cumulus_client_pov_recovery::{PoVRecovery, RecoveryDelayRange, RecoveryHandle};
 use cumulus_primitives_core::{CollectCollationInfo, ParaId};
 use cumulus_relay_chain_inprocess_interface::build_inprocess_relay_chain;
 use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
@@ -59,6 +59,7 @@ pub struct StartCollatorParams<'a, Block: BlockT, BS, Client, RCInterface, Spawn
 	pub import_queue: Box<dyn ImportQueueService<Block>>,
 	pub collator_key: CollatorPair,
 	pub relay_chain_slot_duration: Duration,
+	pub recovery_handle: Box<dyn RecoveryHandle>,
 }

 /// Start a collator node for a parachain.
@@ -79,6 +80,7 @@ pub async fn start_collator<'a, Block, BS, Client, Backend, RCInterface, Spawner
 		import_queue,
 		collator_key,
 		relay_chain_slot_duration,
+		recovery_handle,
 	}: StartCollatorParams<'a, Block, BS, Client, RCInterface, Spawner>,
 ) -> sc_service::error::Result<()>
 where
@@ -113,15 +115,12 @@ where
 		.spawn_essential_handle()
 		.spawn("cumulus-consensus", None, consensus);

-	let overseer_handle = relay_chain_interface
-		.overseer_handle()
-		.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
-
 	let pov_recovery = PoVRecovery::new(
-		overseer_handle.clone(),
+		recovery_handle,
 		// We want that collators wait at maximum the relay chain slot duration before starting
-		// to recover blocks.
-		RecoveryDelay { min: core::time::Duration::ZERO, max: relay_chain_slot_duration },
+		// to recover blocks. Additionally, we wait at least half the slot time to give the
+		// relay chain the chance to increase availability.
+		RecoveryDelayRange { min: relay_chain_slot_duration / 2, max: relay_chain_slot_duration },
 		client.clone(),
 		import_queue,
 		relay_chain_interface.clone(),
@@ -132,6 +131,10 @@ where
 	task_manager
 		.spawn_essential_handle()
 		.spawn("cumulus-pov-recovery", None, pov_recovery.run());
+
+	let overseer_handle = relay_chain_interface
+		.overseer_handle()
+		.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
 	cumulus_client_collator::start_collator(cumulus_client_collator::StartCollatorParams {
 		runtime_api: client,
 		block_status,
@@ -156,6 +159,7 @@ pub struct StartFullNodeParams<'a, Block: BlockT, Client, RCInterface> {
 	pub announce_block: Arc<dyn Fn(Block::Hash, Option<Vec<u8>>) + Send + Sync>,
 	pub relay_chain_slot_duration: Duration,
 	pub import_queue: Box<dyn ImportQueueService<Block>>,
+	pub recovery_handle: Box<dyn RecoveryHandle>,
 }

 /// Start a full node for a parachain.
@@ -171,6 +175,7 @@ pub fn start_full_node<Block, Client, Backend, RCInterface>(
 		para_id,
 		relay_chain_slot_duration,
 		import_queue,
+		recovery_handle,
 	}: StartFullNodeParams<Block, Client, RCInterface>,
 ) -> sc_service::error::Result<()>
 where
@@ -200,18 +205,17 @@ where
 		.spawn_essential_handle()
 		.spawn("cumulus-consensus", None, consensus);

-	let overseer_handle = relay_chain_interface
-		.overseer_handle()
-		.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
-
 	let pov_recovery = PoVRecovery::new(
-		overseer_handle,
+		recovery_handle,
 		// Full nodes should at least wait 2.5 minutes (assuming 6 seconds slot duration) and
 		// in maximum 5 minutes before starting to recover blocks. Collators should already start
 		// the recovery way before full nodes try to recover a certain block and then share the
 		// block with the network using "the normal way". Full nodes are just the "last resort"
 		// for block recovery.
-		RecoveryDelay { min: relay_chain_slot_duration * 25, max: relay_chain_slot_duration * 50 },
+		RecoveryDelayRange {
+			min: relay_chain_slot_duration * 25,
+			max: relay_chain_slot_duration * 50,
+		},
 		client,
 		import_queue,
 		relay_chain_interface,