Add retry mechanism for pov-recovery, fix full-node pov-recovery (#2164)

* Increase delay for pov-recovery

* Update client/service/src/lib.rs

Co-authored-by: Bastian Köcher <git@kchr.de>

* Comment

* FMT

* Clear waiting_recovery when block is recovered or recovery failed

* Introduce recovery queue that preserved insertion order

* Better error logs

* Decrease slot duration

* Style improvements

* Add option to use unordered queue

* Maintain cache of finalized blocks

* Wait for one relay chain slot before recovery

* Make retries testable

* fmt

* Improve docs

* Improve docs

* Simplify RecoveryQueue

* Remove unwanted changes

* Adjust to comments

* Apply suggestions from code review

Co-authored-by: Bastian Köcher <git@kchr.de>

* Move recovery delay into the queue

* Check for finalized number

* Clean up

* Use timer

Co-authored-by: Bastian Köcher <git@kchr.de>

* Simplify implementation

* Revert "Use timer"

This reverts commit 3809eed840d3a09d54212f99486782ff80cdc1c9.

* Properly clear `to_recover` flag

---------

Co-authored-by: Bastian Köcher <git@kchr.de>
This commit is contained in:
Sebastian Kunert
2023-02-09 14:18:55 +01:00
committed by GitHub
parent b4d0992ca8
commit afcfd2404a
15 changed files with 419 additions and 161 deletions
Generated
+7 -2
View File
@@ -372,9 +372,9 @@ dependencies = [
[[package]]
name = "async-trait"
version = "0.1.63"
version = "0.1.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eff18d764974428cf3a9328e23fc5c986f5fbed46e6cd4cdf42544df5d297ec1"
checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2"
dependencies = [
"proc-macro2",
"quote",
@@ -1863,6 +1863,7 @@ dependencies = [
"polkadot-primitives",
"sc-client-api",
"sc-consensus",
"schnellru",
"sp-blockchain",
"sp-consensus",
"sp-runtime",
@@ -1931,6 +1932,7 @@ dependencies = [
name = "cumulus-client-pov-recovery"
version = "0.1.0"
dependencies = [
"async-trait",
"cumulus-primitives-core",
"cumulus-relay-chain-interface",
"cumulus-test-service",
@@ -2421,6 +2423,7 @@ dependencies = [
"cumulus-client-consensus-common",
"cumulus-client-consensus-relay-chain",
"cumulus-client-network",
"cumulus-client-pov-recovery",
"cumulus-client-service",
"cumulus-primitives-core",
"cumulus-primitives-parachain-inherent",
@@ -2438,6 +2441,8 @@ dependencies = [
"parachains-common",
"parity-scale-codec",
"polkadot-cli",
"polkadot-node-subsystem",
"polkadot-overseer",
"polkadot-primitives",
"polkadot-service",
"polkadot-test-service",
+1
View File
@@ -28,6 +28,7 @@ polkadot-primitives = { git = "https://github.com/paritytech/polkadot", branch =
cumulus-primitives-core = { path = "../../../primitives/core" }
cumulus-relay-chain-interface = { path = "../../relay-chain-interface" }
cumulus-client-pov-recovery = { path = "../../pov-recovery" }
schnellru = "0.2.1"
[dev-dependencies]
futures-timer = "3.0.2"
@@ -18,11 +18,12 @@ use sc_client_api::{
Backend, BlockBackend, BlockImportNotification, BlockchainEvents, Finalizer, UsageProvider,
};
use sc_consensus::{BlockImport, BlockImportParams, ForkChoiceStrategy};
use schnellru::{ByLength, LruMap};
use sp_blockchain::Error as ClientError;
use sp_consensus::{BlockOrigin, BlockStatus};
use sp_runtime::traits::{Block as BlockT, Header as HeaderT};
use cumulus_client_pov_recovery::{RecoveryDelay, RecoveryKind, RecoveryRequest};
use cumulus_client_pov_recovery::{RecoveryKind, RecoveryRequest};
use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
use polkadot_primitives::{Hash as PHash, Id as ParaId, OccupiedCoreAssumption};
@@ -30,16 +31,60 @@ use polkadot_primitives::{Hash as PHash, Id as ParaId, OccupiedCoreAssumption};
use codec::Decode;
use futures::{channel::mpsc::Sender, pin_mut, select, FutureExt, Stream, StreamExt};
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
const LOG_TARGET: &str = "cumulus-consensus";
const FINALIZATION_CACHE_SIZE: u32 = 40;
// Delay range to trigger explicit requests.
// The chosen value doesn't have any special meaning, a random delay within the order of
// seconds in practice should be a good enough to allow a quick recovery without DOSing
// the relay chain.
const RECOVERY_DELAY: RecoveryDelay =
RecoveryDelay { min: Duration::ZERO, max: Duration::from_secs(30) };
fn handle_new_finalized_head<P, Block, B>(
parachain: &Arc<P>,
finalized_head: Vec<u8>,
last_seen_finalized_hashes: &mut LruMap<Block::Hash, ()>,
) where
Block: BlockT,
B: Backend<Block>,
P: Finalizer<Block, B> + UsageProvider<Block> + BlockchainEvents<Block>,
{
let header = match Block::Header::decode(&mut &finalized_head[..]) {
Ok(header) => header,
Err(err) => {
tracing::debug!(
target: LOG_TARGET,
error = ?err,
"Could not decode parachain header while following finalized heads.",
);
return
},
};
let hash = header.hash();
last_seen_finalized_hashes.insert(hash, ());
// Only finalize if we are below the incoming finalized parachain head
if parachain.usage_info().chain.finalized_number < *header.number() {
tracing::debug!(
target: LOG_TARGET,
block_hash = ?hash,
"Attempting to finalize header.",
);
if let Err(e) = parachain.finalize_block(hash, None, true) {
match e {
ClientError::UnknownBlock(_) => tracing::debug!(
target: LOG_TARGET,
block_hash = ?hash,
"Could not finalize block because it is unknown.",
),
_ => tracing::warn!(
target: LOG_TARGET,
error = ?e,
block_hash = ?hash,
"Failed to finalize block",
),
}
}
}
}
/// Follow the finalized head of the given parachain.
///
@@ -48,57 +93,75 @@ const RECOVERY_DELAY: RecoveryDelay =
async fn follow_finalized_head<P, Block, B, R>(para_id: ParaId, parachain: Arc<P>, relay_chain: R)
where
Block: BlockT,
P: Finalizer<Block, B> + UsageProvider<Block>,
P: Finalizer<Block, B> + UsageProvider<Block> + BlockchainEvents<Block>,
R: RelayChainInterface + Clone,
B: Backend<Block>,
{
let finalized_heads = match finalized_heads(relay_chain, para_id).await {
Ok(finalized_heads_stream) => finalized_heads_stream,
Ok(finalized_heads_stream) => finalized_heads_stream.fuse(),
Err(err) => {
tracing::error!(target: LOG_TARGET, error = ?err, "Unable to retrieve finalized heads stream.");
return
},
};
let mut imported_blocks = parachain.import_notification_stream().fuse();
pin_mut!(finalized_heads);
// We use this cache to finalize blocks that are imported late.
// For example, a block that has been recovered via PoV-Recovery
// on a full node can have several minutes delay. With this cache
// we have some "memory" of recently finalized blocks.
let mut last_seen_finalized_hashes = LruMap::new(ByLength::new(FINALIZATION_CACHE_SIZE));
loop {
let finalized_head = if let Some(h) = finalized_heads.next().await {
h
} else {
tracing::debug!(target: LOG_TARGET, "Stopping following finalized head.");
return
};
let header = match Block::Header::decode(&mut &finalized_head[..]) {
Ok(header) => header,
Err(err) => {
tracing::debug!(
target: LOG_TARGET,
error = ?err,
"Could not decode parachain header while following finalized heads.",
);
continue
select! {
fin = finalized_heads.next() => {
match fin {
Some(finalized_head) =>
handle_new_finalized_head(&parachain, finalized_head, &mut last_seen_finalized_hashes),
None => {
tracing::debug!(target: LOG_TARGET, "Stopping following finalized head.");
return
}
}
},
};
imported = imported_blocks.next() => {
match imported {
Some(imported_block) => {
// When we see a block import that is already finalized, we immediately finalize it.
if last_seen_finalized_hashes.peek(&imported_block.hash).is_some() {
tracing::debug!(
target: LOG_TARGET,
block_hash = ?imported_block.hash,
"Setting newly imported block as finalized.",
);
let hash = header.hash();
// don't finalize the same block multiple times.
if parachain.usage_info().chain.finalized_hash != hash {
if let Err(e) = parachain.finalize_block(hash, None, true) {
match e {
ClientError::UnknownBlock(_) => tracing::debug!(
target: LOG_TARGET,
block_hash = ?hash,
"Could not finalize block because it is unknown.",
),
_ => tracing::warn!(
target: LOG_TARGET,
error = ?e,
block_hash = ?hash,
"Failed to finalize block",
),
if let Err(e) = parachain.finalize_block(imported_block.hash, None, true) {
match e {
ClientError::UnknownBlock(_) => tracing::debug!(
target: LOG_TARGET,
block_hash = ?imported_block.hash,
"Could not finalize block because it is unknown.",
),
_ => tracing::warn!(
target: LOG_TARGET,
error = ?e,
block_hash = ?imported_block.hash,
"Failed to finalize block",
),
}
}
}
},
None => {
tracing::debug!(
target: LOG_TARGET,
"Stopping following imported blocks.",
);
return
}
}
}
}
@@ -266,7 +329,11 @@ async fn handle_new_block_imported<Block, P>(
let unset_best_header = unset_best_header_opt
.take()
.expect("We checked above that the value is set; qed");
tracing::debug!(
target: LOG_TARGET,
?unset_hash,
"Importing block as new best for parachain.",
);
import_block_as_new_best(unset_hash, unset_best_header, parachain).await;
},
state => tracing::debug!(
@@ -315,7 +382,11 @@ async fn handle_new_best_parachain_head<Block, P>(
match parachain.block_status(hash) {
Ok(BlockStatus::InChainWithState) => {
unset_best_header.take();
tracing::debug!(
target: LOG_TARGET,
?hash,
"Importing block as new best for parachain.",
);
import_block_as_new_best(hash, parachain_head, parachain).await;
},
Ok(BlockStatus::InChainPruned) => {
@@ -338,8 +409,7 @@ async fn handle_new_best_parachain_head<Block, P>(
// Best effort channel to actively encourage block recovery.
// An error here is not fatal; the relay chain continuously re-announces
// the best block, thus we will have other opportunities to retry.
let req =
RecoveryRequest { hash, delay: RECOVERY_DELAY, kind: RecoveryKind::Full };
let req = RecoveryRequest { hash, kind: RecoveryKind::Full };
if let Err(err) = recovery_chan_tx.try_send(req) {
tracing::warn!(
target: LOG_TARGET,
+1
View File
@@ -28,6 +28,7 @@ polkadot-primitives = { git = "https://github.com/paritytech/polkadot", branch =
# Cumulus
cumulus-primitives-core = { path = "../../primitives/core" }
cumulus-relay-chain-interface = {path = "../relay-chain-interface"}
async-trait = "0.1.64"
[dev-dependencies]
tokio = { version = "1.25.0", features = ["macros"] }
@@ -18,12 +18,13 @@ use sp_runtime::traits::Block as BlockT;
use polkadot_node_primitives::AvailableData;
use polkadot_node_subsystem::messages::AvailabilityRecoveryMessage;
use polkadot_overseer::Handle as OverseerHandle;
use futures::{channel::oneshot, stream::FuturesUnordered, Future, FutureExt, StreamExt};
use std::{collections::HashSet, pin::Pin};
use crate::RecoveryHandle;
/// The active candidate recovery.
///
/// This handles the candidate recovery and tracks the activate recoveries.
@@ -34,12 +35,12 @@ pub(crate) struct ActiveCandidateRecovery<Block: BlockT> {
>,
/// The block hashes of the candidates currently being recovered.
candidates: HashSet<Block::Hash>,
overseer_handle: OverseerHandle,
recovery_handle: Box<dyn RecoveryHandle>,
}
impl<Block: BlockT> ActiveCandidateRecovery<Block> {
pub fn new(overseer_handle: OverseerHandle) -> Self {
Self { recoveries: Default::default(), candidates: Default::default(), overseer_handle }
pub fn new(recovery_handle: Box<dyn RecoveryHandle>) -> Self {
Self { recoveries: Default::default(), candidates: Default::default(), recovery_handle }
}
/// Recover the given `candidate`.
@@ -50,8 +51,8 @@ impl<Block: BlockT> ActiveCandidateRecovery<Block> {
) {
let (tx, rx) = oneshot::channel();
self.overseer_handle
.send_msg(
self.recovery_handle
.send_recovery_msg(
AvailabilityRecoveryMessage::RecoverAvailableData(
candidate.receipt.clone(),
candidate.session_index,
@@ -90,11 +91,6 @@ impl<Block: BlockT> ActiveCandidateRecovery<Block> {
);
}
/// Returns if the given `candidate` is being recovered.
pub fn is_being_recovered(&self, candidate: &Block::Hash) -> bool {
self.candidates.contains(candidate)
}
/// Waits for the next recovery.
///
/// If the returned [`AvailableData`] is `None`, it means that the recovery failed.
+161 -74
View File
@@ -29,14 +29,18 @@
//!
//! 1. For every included relay chain block we note the backed candidate of our parachain. If the
//! block belonging to the PoV is already known, we do nothing. Otherwise we start
//! a timer that waits a random time between 0..relay_chain_slot_length before starting to recover
//! a timer that waits for a randomized time inside a specified interval before starting to recover
//! the PoV.
//!
//! 2. If between starting and firing the timer the block is imported, we skip the recovery of the
//! PoV.
//!
//! 3. If the timer fired we recover the PoV using the relay chain PoV recovery protocol. After it
//! is recovered, we restore the block and import it.
//! 3. If the timer fired we recover the PoV using the relay chain PoV recovery protocol.
//!
//! 4a. After it is recovered, we restore the block and import it.
//!
//! 4b. Since we are trying to recover pending candidates, availability is not guaranteed. If the block
//! PoV is not yet available, we retry.
//!
//! If we need to recover multiple PoV blocks (which should hopefully not happen in real life), we
//! make sure that the blocks are imported in the correct order.
@@ -47,6 +51,7 @@ use sp_consensus::{BlockOrigin, BlockStatus};
use sp_runtime::traits::{Block as BlockT, Header as HeaderT, NumberFor};
use polkadot_node_primitives::{AvailableData, POV_BOMB_LIMIT};
use polkadot_node_subsystem::messages::AvailabilityRecoveryMessage;
use polkadot_overseer::Handle as OverseerHandle;
use polkadot_primitives::{
CandidateReceipt, CommittedCandidateReceipt, Id as ParaId, SessionIndex,
@@ -60,10 +65,10 @@ use futures::{
channel::mpsc::Receiver, select, stream::FuturesUnordered, Future, FutureExt, Stream, StreamExt,
};
use futures_timer::Delay;
use rand::{thread_rng, Rng};
use rand::{distributions::Uniform, prelude::Distribution, thread_rng};
use std::{
collections::{HashMap, VecDeque},
collections::{HashMap, HashSet, VecDeque},
pin::Pin,
sync::Arc,
time::Duration,
@@ -74,6 +79,28 @@ use active_candidate_recovery::ActiveCandidateRecovery;
const LOG_TARGET: &str = "cumulus-pov-recovery";
/// Test-friendly wrapper trait for the overseer handle.
/// Can be used to simulate failing recovery requests.
#[async_trait::async_trait]
pub trait RecoveryHandle: Send {
async fn send_recovery_msg(
&mut self,
message: AvailabilityRecoveryMessage,
origin: &'static str,
);
}
#[async_trait::async_trait]
impl RecoveryHandle for OverseerHandle {
async fn send_recovery_msg(
&mut self,
message: AvailabilityRecoveryMessage,
origin: &'static str,
) {
self.send_msg(message, origin).await;
}
}
/// Type of recovery to trigger.
#[derive(Debug, PartialEq)]
pub enum RecoveryKind {
@@ -87,24 +114,30 @@ pub enum RecoveryKind {
pub struct RecoveryRequest<Block: BlockT> {
/// Hash of the last block to recover.
pub hash: Block::Hash,
/// Recovery delay range. Randomizing the start of the recovery within this interval
/// can be used to prevent self-DOSing if the recovery request is part of a
/// distributed protocol and there is the possibility that multiple actors are
/// requiring to perform the recovery action at approximately the same time.
pub delay: RecoveryDelay,
/// Recovery type.
pub kind: RecoveryKind,
}
/// The delay between observing an unknown block and triggering the recovery of a block.
/// Randomizing the start of the recovery within this interval
/// can be used to prevent self-DOSing if the recovery request is part of a
/// distributed protocol and there is the possibility that multiple actors are
/// requiring to perform the recovery action at approximately the same time.
#[derive(Clone, Copy)]
pub struct RecoveryDelay {
pub struct RecoveryDelayRange {
/// Start recovering after `min` delay.
pub min: Duration,
/// Start recovering before `max` delay.
pub max: Duration,
}
impl RecoveryDelayRange {
/// Produce a randomized duration between `min` and `max`.
fn duration(&self) -> Duration {
Uniform::from(self.min..=self.max).sample(&mut thread_rng())
}
}
/// Represents an outstanding block candidate.
struct Candidate<Block: BlockT> {
receipt: CandidateReceipt,
@@ -112,9 +145,66 @@ struct Candidate<Block: BlockT> {
block_number: NumberFor<Block>,
parent_hash: Block::Hash,
// Lazy recovery has been submitted.
// Should be true iff a block is either queued to be recovered or
// recovery is currently in progress.
waiting_recovery: bool,
}
/// Queue that is used to decide when to start PoV-recovery operations.
struct RecoveryQueue<Block: BlockT> {
recovery_delay_range: RecoveryDelayRange,
// Queue that keeps the hashes of blocks to be recovered.
recovery_queue: VecDeque<Block::Hash>,
// Futures that resolve when a new recovery should be started.
signaling_queue: FuturesUnordered<Pin<Box<dyn Future<Output = ()> + Send>>>,
}
impl<Block: BlockT> RecoveryQueue<Block> {
pub fn new(recovery_delay_range: RecoveryDelayRange) -> Self {
Self {
recovery_delay_range,
recovery_queue: Default::default(),
signaling_queue: Default::default(),
}
}
/// Add hash of a block that should go to the end of the recovery queue.
/// A new recovery will be signaled after `delay` has passed.
pub fn push_recovery(&mut self, hash: Block::Hash) {
let delay = self.recovery_delay_range.duration();
tracing::debug!(
target: LOG_TARGET,
block_hash = ?hash,
"Adding block to queue and adding new recovery slot in {:?} sec",
delay.as_secs(),
);
self.recovery_queue.push_back(hash);
self.signaling_queue.push(
async move {
Delay::new(delay).await;
}
.boxed(),
);
}
/// Get the next hash for block recovery.
pub async fn next_recovery(&mut self) -> Block::Hash {
loop {
if let Some(_) = self.signaling_queue.next().await {
if let Some(hash) = self.recovery_queue.pop_front() {
return hash
} else {
tracing::error!(
target: LOG_TARGET,
"Recovery was signaled, but no candidate hash available. This is a bug."
);
};
}
futures::pending!()
}
}
}
/// Encapsulates the logic of the pov recovery.
pub struct PoVRecovery<Block: BlockT, PC, RC> {
/// All the pending candidates that we are waiting for to be imported or that need to be
@@ -122,21 +212,22 @@ pub struct PoVRecovery<Block: BlockT, PC, RC> {
candidates: HashMap<Block::Hash, Candidate<Block>>,
/// A stream of futures that resolve to hashes of candidates that need to be recovered.
///
/// The candidates to the hashes are stored in `pending_candidates`. If a candidate is not
/// The candidates to the hashes are stored in `candidates`. If a candidate is not
/// available anymore in this map, it means that it was already imported.
next_candidate_to_recover: FuturesUnordered<Pin<Box<dyn Future<Output = Block::Hash> + Send>>>,
candidate_recovery_queue: RecoveryQueue<Block>,
active_candidate_recovery: ActiveCandidateRecovery<Block>,
/// Blocks that wait that the parent is imported.
///
/// Uses parent -> blocks mapping.
waiting_for_parent: HashMap<Block::Hash, Vec<Block>>,
recovery_delay: RecoveryDelay,
parachain_client: Arc<PC>,
parachain_import_queue: Box<dyn ImportQueueService<Block>>,
relay_chain_interface: RC,
para_id: ParaId,
/// Explicit block recovery requests channel.
recovery_chan_rx: Receiver<RecoveryRequest<Block>>,
/// Blocks that we are retrying currently
candidates_in_retry: HashSet<Block::Hash>,
}
impl<Block: BlockT, PC, RCInterface> PoVRecovery<Block, PC, RCInterface>
@@ -146,8 +237,8 @@ where
{
/// Create a new instance.
pub fn new(
overseer_handle: OverseerHandle,
recovery_delay: RecoveryDelay,
recovery_handle: Box<dyn RecoveryHandle>,
recovery_delay_range: RecoveryDelayRange,
parachain_client: Arc<PC>,
parachain_import_queue: Box<dyn ImportQueueService<Block>>,
relay_chain_interface: RCInterface,
@@ -156,14 +247,14 @@ where
) -> Self {
Self {
candidates: HashMap::new(),
next_candidate_to_recover: Default::default(),
active_candidate_recovery: ActiveCandidateRecovery::new(overseer_handle),
recovery_delay,
candidate_recovery_queue: RecoveryQueue::new(recovery_delay_range),
active_candidate_recovery: ActiveCandidateRecovery::new(recovery_handle),
waiting_for_parent: HashMap::new(),
parachain_client,
parachain_import_queue,
relay_chain_interface,
para_id,
candidates_in_retry: HashSet::new(),
recovery_chan_rx,
}
}
@@ -210,15 +301,11 @@ where
// If required, triggers a lazy recovery request that will eventually be blocked
// if in the meantime the block is imported.
self.recover(RecoveryRequest {
hash,
delay: self.recovery_delay,
kind: RecoveryKind::Simple,
});
self.recover(RecoveryRequest { hash, kind: RecoveryKind::Simple });
}
/// Handle an imported block.
fn handle_block_imported(&mut self, block_hash: &Block::Hash) {
/// Block is no longer waiting for recovery
fn clear_waiting_recovery(&mut self, block_hash: &Block::Hash) {
self.candidates.get_mut(block_hash).map(|candidate| {
// Prevents triggering an already enqueued recovery request
candidate.waiting_recovery = false;
@@ -241,9 +328,9 @@ where
}
}
/// Clear `waiting_for_parent` from the given `hash` and do this recursively for all child
/// blocks.
fn clear_waiting_for_parent(&mut self, hash: Block::Hash) {
/// Clear `waiting_for_parent` and `waiting_recovery` for the candidate with `hash`.
/// Also clears children blocks waiting for this parent.
fn reset_candidate(&mut self, hash: Block::Hash) {
let mut blocks_to_delete = vec![hash];
while let Some(delete) = blocks_to_delete.pop() {
@@ -251,6 +338,7 @@ where
blocks_to_delete.extend(childs.iter().map(BlockT::hash));
}
}
self.clear_waiting_recovery(&hash);
}
/// Handle a recovered candidate.
@@ -260,11 +348,25 @@ where
available_data: Option<AvailableData>,
) {
let available_data = match available_data {
Some(data) => data,
None => {
self.clear_waiting_for_parent(block_hash);
return
Some(data) => {
self.candidates_in_retry.remove(&block_hash);
data
},
None =>
if self.candidates_in_retry.insert(block_hash) {
tracing::debug!(target: LOG_TARGET, ?block_hash, "Recovery failed, retrying.");
self.candidate_recovery_queue.push_recovery(block_hash);
return
} else {
tracing::warn!(
target: LOG_TARGET,
?block_hash,
"Unable to recover block after retry.",
);
self.candidates_in_retry.remove(&block_hash);
self.reset_candidate(block_hash);
return
},
};
let raw_block_data = match sp_maybe_compressed_blob::decompress(
@@ -275,8 +377,7 @@ where
Err(error) => {
tracing::debug!(target: LOG_TARGET, ?error, "Failed to decompress PoV");
self.clear_waiting_for_parent(block_hash);
self.reset_candidate(block_hash);
return
},
};
@@ -290,8 +391,7 @@ where
"Failed to decode parachain block data from recovered PoV",
);
self.clear_waiting_for_parent(block_hash);
self.reset_candidate(block_hash);
return
},
};
@@ -302,12 +402,17 @@ where
match self.parachain_client.block_status(parent) {
Ok(BlockStatus::Unknown) => {
if self.active_candidate_recovery.is_being_recovered(&parent) {
// If the parent block is currently being recovered or is scheduled to be recovered,
// we want to wait for the parent.
let parent_scheduled_for_recovery =
self.candidates.get(&parent).map_or(false, |parent| parent.waiting_recovery);
if parent_scheduled_for_recovery {
tracing::debug!(
target: LOG_TARGET,
?block_hash,
parent_hash = ?parent,
"Parent is still being recovered, waiting.",
parent_scheduled_for_recovery,
"Waiting for recovery of parent.",
);
self.waiting_for_parent.entry(parent).or_default().push(block);
@@ -320,8 +425,7 @@ where
"Parent not found while trying to import recovered block.",
);
self.clear_waiting_for_parent(block_hash);
self.reset_candidate(block_hash);
return
}
},
@@ -333,8 +437,7 @@ where
"Error while checking block status",
);
self.clear_waiting_for_parent(block_hash);
self.reset_candidate(block_hash);
return
},
// Any other status is fine to "ignore/accept"
@@ -383,10 +486,10 @@ where
/// Attempts an explicit recovery of one or more blocks.
pub fn recover(&mut self, req: RecoveryRequest<Block>) {
let RecoveryRequest { mut hash, delay, kind } = req;
let RecoveryRequest { mut hash, kind } = req;
let mut to_recover = Vec::new();
let do_recover = loop {
loop {
let candidate = match self.candidates.get_mut(&hash) {
Some(candidate) => candidate,
None => {
@@ -395,7 +498,7 @@ where
block_hash = ?hash,
"Cound not recover. Block was never announced as candidate"
);
break false
return
},
};
@@ -404,7 +507,7 @@ where
candidate.waiting_recovery = true;
to_recover.push(hash);
},
Ok(_) => break true,
Ok(_) => break,
Err(e) => {
tracing::error!(
target: LOG_TARGET,
@@ -412,36 +515,22 @@ where
block_hash = ?hash,
"Failed to get block status",
);
break false
for hash in to_recover {
self.clear_waiting_recovery(&hash);
}
return
},
}
if kind == RecoveryKind::Simple {
break true
break
}
hash = candidate.parent_hash;
};
}
if do_recover {
for hash in to_recover.into_iter().rev() {
let delay =
delay.min + delay.max.saturating_sub(delay.min).mul_f64(thread_rng().gen());
tracing::debug!(
target: LOG_TARGET,
block_hash = ?hash,
"Starting {:?} block recovery in {:?} sec",
kind,
delay.as_secs(),
);
self.next_candidate_to_recover.push(
async move {
Delay::new(delay).await;
hash
}
.boxed(),
);
}
for hash in to_recover.into_iter().rev() {
self.candidate_recovery_queue.push_recovery(hash);
}
}
@@ -480,7 +569,7 @@ where
},
imported = imported_blocks.next() => {
if let Some(imported) = imported {
self.handle_block_imported(&imported.hash);
self.clear_waiting_recovery(&imported.hash);
} else {
tracing::debug!(target: LOG_TARGET, "Imported blocks stream ended");
return;
@@ -494,10 +583,8 @@ where
return;
}
},
next_to_recover = self.next_candidate_to_recover.next() => {
if let Some(block_hash) = next_to_recover {
self.recover_candidate(block_hash).await;
}
next_to_recover = self.candidate_recovery_queue.next_recovery().fuse() => {
self.recover_candidate(next_to_recover).await;
},
(block_hash, available_data) =
self.active_candidate_recovery.wait_for_recovery().fuse() =>
+18 -14
View File
@@ -20,7 +20,7 @@
use cumulus_client_cli::CollatorOptions;
use cumulus_client_consensus_common::ParachainConsensus;
use cumulus_client_pov_recovery::{PoVRecovery, RecoveryDelay};
use cumulus_client_pov_recovery::{PoVRecovery, RecoveryDelayRange, RecoveryHandle};
use cumulus_primitives_core::{CollectCollationInfo, ParaId};
use cumulus_relay_chain_inprocess_interface::build_inprocess_relay_chain;
use cumulus_relay_chain_interface::{RelayChainInterface, RelayChainResult};
@@ -59,6 +59,7 @@ pub struct StartCollatorParams<'a, Block: BlockT, BS, Client, RCInterface, Spawn
pub import_queue: Box<dyn ImportQueueService<Block>>,
pub collator_key: CollatorPair,
pub relay_chain_slot_duration: Duration,
pub recovery_handle: Box<dyn RecoveryHandle>,
}
/// Start a collator node for a parachain.
@@ -79,6 +80,7 @@ pub async fn start_collator<'a, Block, BS, Client, Backend, RCInterface, Spawner
import_queue,
collator_key,
relay_chain_slot_duration,
recovery_handle,
}: StartCollatorParams<'a, Block, BS, Client, RCInterface, Spawner>,
) -> sc_service::error::Result<()>
where
@@ -113,15 +115,12 @@ where
.spawn_essential_handle()
.spawn("cumulus-consensus", None, consensus);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
let pov_recovery = PoVRecovery::new(
overseer_handle.clone(),
recovery_handle,
// We want that collators wait at maximum the relay chain slot duration before starting
// to recover blocks.
RecoveryDelay { min: core::time::Duration::ZERO, max: relay_chain_slot_duration },
// to recover blocks. Additionally, we wait at least half the slot time to give the
// relay chain the chance to increase availability.
RecoveryDelayRange { min: relay_chain_slot_duration / 2, max: relay_chain_slot_duration },
client.clone(),
import_queue,
relay_chain_interface.clone(),
@@ -132,6 +131,10 @@ where
task_manager
.spawn_essential_handle()
.spawn("cumulus-pov-recovery", None, pov_recovery.run());
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
cumulus_client_collator::start_collator(cumulus_client_collator::StartCollatorParams {
runtime_api: client,
block_status,
@@ -156,6 +159,7 @@ pub struct StartFullNodeParams<'a, Block: BlockT, Client, RCInterface> {
pub announce_block: Arc<dyn Fn(Block::Hash, Option<Vec<u8>>) + Send + Sync>,
pub relay_chain_slot_duration: Duration,
pub import_queue: Box<dyn ImportQueueService<Block>>,
pub recovery_handle: Box<dyn RecoveryHandle>,
}
/// Start a full node for a parachain.
@@ -171,6 +175,7 @@ pub fn start_full_node<Block, Client, Backend, RCInterface>(
para_id,
relay_chain_slot_duration,
import_queue,
recovery_handle,
}: StartFullNodeParams<Block, Client, RCInterface>,
) -> sc_service::error::Result<()>
where
@@ -200,18 +205,17 @@ where
.spawn_essential_handle()
.spawn("cumulus-consensus", None, consensus);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
let pov_recovery = PoVRecovery::new(
overseer_handle,
recovery_handle,
// Full nodes should at least wait 2.5 minutes (assuming 6 seconds slot duration) and
// in maximum 5 minutes before starting to recover blocks. Collators should already start
// the recovery way before full nodes try to recover a certain block and then share the
// block with the network using "the normal way". Full nodes are just the "last resort"
// for block recovery.
RecoveryDelay { min: relay_chain_slot_duration * 25, max: relay_chain_slot_duration * 50 },
RecoveryDelayRange {
min: relay_chain_slot_duration * 25,
max: relay_chain_slot_duration * 50,
},
client,
import_queue,
relay_chain_interface,
+6
View File
@@ -256,6 +256,10 @@ async fn start_node_impl(
let relay_chain_slot_duration = Duration::from_secs(6);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
if validator {
let parachain_consensus = build_consensus(
client.clone(),
@@ -284,6 +288,7 @@ async fn start_node_impl(
import_queue: import_queue_service,
collator_key: collator_key.expect("Command line arguments do not allow this. qed"),
relay_chain_slot_duration,
recovery_handle: Box::new(overseer_handle),
};
start_collator(params).await?;
@@ -296,6 +301,7 @@ async fn start_node_impl(
relay_chain_interface,
relay_chain_slot_duration,
import_queue: import_queue_service,
recovery_handle: Box::new(overseer_handle),
};
start_full_node(params)?;
+16
View File
@@ -452,6 +452,10 @@ where
let relay_chain_slot_duration = Duration::from_secs(6);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
if validator {
let parachain_consensus = build_consensus(
client.clone(),
@@ -480,6 +484,7 @@ where
import_queue: import_queue_service,
collator_key: collator_key.expect("Command line arguments do not allow this. qed"),
relay_chain_slot_duration,
recovery_handle: Box::new(overseer_handle),
};
start_collator(params).await?;
@@ -492,6 +497,7 @@ where
relay_chain_interface,
relay_chain_slot_duration,
import_queue: import_queue_service,
recovery_handle: Box::new(overseer_handle),
};
start_full_node(params)?;
@@ -652,6 +658,9 @@ where
let relay_chain_slot_duration = Duration::from_secs(6);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
if validator {
let parachain_consensus = build_consensus(
client.clone(),
@@ -680,6 +689,7 @@ where
import_queue: import_queue_service,
collator_key: collator_key.expect("Command line arguments do not allow this. qed"),
relay_chain_slot_duration,
recovery_handle: Box::new(overseer_handle),
};
start_collator(params).await?;
@@ -692,6 +702,7 @@ where
relay_chain_interface,
relay_chain_slot_duration,
import_queue: import_queue_service,
recovery_handle: Box::new(overseer_handle),
};
start_full_node(params)?;
@@ -1425,6 +1436,9 @@ where
let relay_chain_slot_duration = Duration::from_secs(6);
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
if validator {
let parachain_consensus = build_consensus(
client.clone(),
@@ -1453,6 +1467,7 @@ where
import_queue: import_queue_service,
collator_key: collator_key.expect("Command line arguments do not allow this. qed"),
relay_chain_slot_duration,
recovery_handle: Box::new(overseer_handle),
};
start_collator(params).await?;
@@ -1465,6 +1480,7 @@ where
relay_chain_interface,
relay_chain_slot_duration,
import_queue: import_queue_service,
recovery_handle: Box::new(overseer_handle),
};
start_full_node(params)?;
+3
View File
@@ -54,6 +54,8 @@ polkadot-primitives = { git = "https://github.com/paritytech/polkadot", branch =
polkadot-service = { git = "https://github.com/paritytech/polkadot", branch = "master" }
polkadot-test-service = { git = "https://github.com/paritytech/polkadot", branch = "master" }
polkadot-cli = { git = "https://github.com/paritytech/polkadot", branch = "master" }
polkadot-node-subsystem = { git = "https://github.com/paritytech/polkadot", branch = "master" }
polkadot-overseer = { git = "https://github.com/paritytech/polkadot", branch = "master" }
# Cumulus
cumulus-client-cli = { path = "../../client/cli" }
@@ -70,6 +72,7 @@ cumulus-relay-chain-rpc-interface = { path = "../../client/relay-chain-rpc-inter
cumulus-test-relay-validation-worker-provider = { path = "../relay-validation-worker-provider" }
cumulus-test-runtime = { path = "../runtime" }
cumulus-relay-chain-minimal-node = { path = "../../client/relay-chain-minimal-node" }
cumulus-client-pov-recovery = { path = "../../client/pov-recovery" }
[dev-dependencies]
futures = "0.3.26"
+3
View File
@@ -49,6 +49,9 @@ pub struct TestCollatorCli {
#[arg(long)]
pub disable_block_announcements: bool,
#[arg(long)]
pub fail_pov_recovery: bool,
}
#[derive(Debug, clap::Subcommand)]
+56 -4
View File
@@ -34,6 +34,7 @@ use cumulus_client_consensus_common::{
ParachainBlockImport as TParachainBlockImport, ParachainCandidate, ParachainConsensus,
};
use cumulus_client_network::BlockAnnounceValidator;
use cumulus_client_pov_recovery::RecoveryHandle;
use cumulus_client_service::{
prepare_node_config, start_collator, start_full_node, StartCollatorParams, StartFullNodeParams,
};
@@ -45,6 +46,8 @@ use cumulus_relay_chain_minimal_node::build_minimal_relay_chain_node;
use cumulus_test_runtime::{Hash, Header, NodeBlock as Block, RuntimeApi};
use frame_system_rpc_runtime_api::AccountNonceApi;
use polkadot_node_subsystem::{errors::RecoveryError, messages::AvailabilityRecoveryMessage};
use polkadot_overseer::Handle as OverseerHandle;
use polkadot_primitives::{CollatorPair, Hash as PHash, PersistedValidationData};
use polkadot_service::ProvideRuntimeApi;
use sc_client_api::execution_extensions::ExecutionStrategies;
@@ -76,6 +79,8 @@ pub use cumulus_test_runtime as runtime;
pub use genesis::*;
pub use sp_keyring::Sr25519Keyring as Keyring;
const LOG_TARGET: &str = "cumulus-test-service";
/// A consensus that will never produce any block.
#[derive(Clone)]
struct NullConsensus;
@@ -126,6 +131,41 @@ pub type ParachainBlockImport = TParachainBlockImport<Block, Arc<Client>, Backen
/// Transaction pool type used by the test service
pub type TransactionPool = Arc<sc_transaction_pool::FullPool<Block, Client>>;
/// Recovery handle that fails regularly to simulate unavailable povs.
///
/// Used in tests to exercise the pov-recovery retry mechanism: most
/// messages are forwarded to the real overseer, but some are answered
/// with `RecoveryError::Unavailable` instead.
pub struct FailingRecoveryHandle {
// Real overseer handle that receives the messages we do not fail.
overseer_handle: OverseerHandle,
// Count of recovery messages seen so far; drives the failure cadence.
counter: u32,
}
impl FailingRecoveryHandle {
/// Create a new `FailingRecoveryHandle` wrapping the given overseer handle.
///
/// The internal counter starts at 0, so the very first recovery message
/// sent through this handle will be failed (see `send_recovery_msg`).
pub fn new(overseer_handle: OverseerHandle) -> Self {
Self { overseer_handle, counter: 0 }
}
}
#[async_trait::async_trait]
impl RecoveryHandle for FailingRecoveryHandle {
/// Forward `message` to the overseer, except that every 5th message
/// (including the very first, since `counter` starts at 0) is answered
/// immediately with `RecoveryError::Unavailable` to trigger a retry in
/// the pov-recovery logic under test.
async fn send_recovery_msg(
&mut self,
message: AvailabilityRecoveryMessage,
origin: &'static str,
) {
// Every 5th message we immediately signal unavailability to trigger
// a retry. Note this also fails the first message (counter == 0).
if self.counter % 5 == 0 {
// Destructure the message to get hold of the response channel and
// answer with `Unavailable` without contacting the overseer at all.
let AvailabilityRecoveryMessage::RecoverAvailableData(_, _, _, back_sender) = message;
tracing::info!(target: LOG_TARGET, "Failing pov recovery.");
back_sender
.send(Err(RecoveryError::Unavailable))
.expect("Return channel should work here.");
} else {
// Normal path: hand the recovery request to the real overseer.
self.overseer_handle.send_msg(message, origin).await;
}
self.counter += 1;
}
}
/// Starts a `ServiceBuilder` for a full service.
///
/// Use this macro if you don't actually need the full service, but just the builder in order to
@@ -236,6 +276,7 @@ pub async fn start_node_impl<RB>(
relay_chain_config: Configuration,
para_id: ParaId,
wrap_announce_block: Option<Box<dyn FnOnce(AnnounceBlockFn) -> AnnounceBlockFn>>,
fail_pov_recovery: bool,
rpc_ext_builder: RB,
consensus: Consensus,
collator_options: CollatorOptions,
@@ -320,6 +361,17 @@ where
.unwrap_or_else(|| announce_block);
let relay_chain_interface_for_closure = relay_chain_interface.clone();
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
let recovery_handle: Box<dyn RecoveryHandle> = if fail_pov_recovery {
Box::new(FailingRecoveryHandle::new(overseer_handle))
} else {
Box::new(overseer_handle)
};
if let Some(collator_key) = collator_key {
let parachain_consensus: Box<dyn ParachainConsensus<Block>> = match consensus {
Consensus::RelayChain => {
@@ -374,6 +426,7 @@ where
collator_key,
import_queue: import_queue_service,
relay_chain_slot_duration: Duration::from_secs(6),
recovery_handle,
};
start_collator(params).await?;
@@ -385,10 +438,8 @@ where
para_id,
relay_chain_interface,
import_queue: import_queue_service,
// The slot duration is currently used internally only to configure
// the recovery delay of pov-recovery. We don't want to wait for too
// long on the full node to recover, so we reduce this time here.
relay_chain_slot_duration: Duration::from_millis(6),
relay_chain_slot_duration: Duration::from_secs(6),
recovery_handle,
};
start_full_node(params)?;
@@ -600,6 +651,7 @@ impl TestNodeBuilder {
relay_chain_config,
self.para_id,
self.wrap_announce_block,
false,
|_| Ok(jsonrpsee::RpcModule::new(())),
self.consensus,
collator_options,
+4
View File
@@ -123,6 +123,9 @@ fn main() -> Result<(), sc_cli::Error> {
"Is collating: {}",
if config.role.is_authority() { "yes" } else { "no" }
);
if cli.fail_pov_recovery {
tracing::info!("PoV recovery failure enabled");
}
let collator_key = config.role.is_authority().then(|| CollatorPair::generate().0);
@@ -141,6 +144,7 @@ fn main() -> Result<(), sc_cli::Error> {
polkadot_config,
parachain_id,
cli.disable_block_announcements.then(wrap_announce_block),
cli.fail_pov_recovery,
|_| Ok(jsonrpsee::RpcModule::new(())),
consensus,
collator_options,
+6 -4
View File
@@ -12,9 +12,10 @@ bob: is up within 60 seconds
charlie: is up within 60 seconds
one: is up within 60 seconds
two: is up within 60 seconds
eve: is up within 60 seconds
# wait 30 blocks and register parachain
validator-3: reports block height is at least 30 within 250 seconds
# wait 20 blocks and register parachain
validator-3: reports block height is at least 20 within 250 seconds
validator-0: js-script ./register-para.js with "2000" within 240 seconds
validator-0: parachain 2000 is registered within 300 seconds
@@ -22,5 +23,6 @@ validator-0: parachain 2000 is registered within 300 seconds
bob: reports block height is at least 20 within 600 seconds
alice: reports block height is at least 20 within 600 seconds
charlie: reports block height is at least 20 within 600 seconds
one: reports block height is at least 20 within 600 seconds
two: reports block height is at least 20 within 600 seconds
one: reports block height is at least 20 within 800 seconds
two: reports block height is at least 20 within 800 seconds
eve: reports block height is at least 20 within 800 seconds
+12 -4
View File
@@ -40,7 +40,7 @@ add_to_genesis = false
validator = true # collator
image = "{{COL_IMAGE}}"
command = "test-parachain"
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug", "--use-null-consensus", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug,cumulus-consensus=debug", "--use-null-consensus", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
# run eve as a parachain full node
[[parachains.collators]]
@@ -48,7 +48,15 @@ add_to_genesis = false
validator = false # full node
image = "{{COL_IMAGE}}"
command = "test-parachain"
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}","--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug,cumulus-consensus=debug", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}","--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
# We deliberately fail PoV recovery for eve from time to time to exercise the retry logic.
[[parachains.collators]]
name = "eve"
validator = true # collator
image = "{{COL_IMAGE}}"
command = "test-parachain"
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug,cumulus-consensus=debug", "--fail-pov-recovery", "--use-null-consensus", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
# run one as a RPC collator who does not produce blocks
[[parachains.collators]]
@@ -56,7 +64,7 @@ add_to_genesis = false
validator = true # collator
image = "{{COL_IMAGE}}"
command = "test-parachain"
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug", "--use-null-consensus", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--relay-chain-rpc-url {{'ferdie'|zombie('wsUri')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug,cumulus-consensus=debug", "--use-null-consensus", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--relay-chain-rpc-url {{'ferdie'|zombie('wsUri')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
# run two as a RPC parachain full node
[[parachains.collators]]
@@ -64,4 +72,4 @@ add_to_genesis = false
validator = false # full node
image = "{{COL_IMAGE}}"
command = "test-parachain"
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--relay-chain-rpc-url {{'ferdie'|zombie('wsUri')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]
args = ["-lparachain::availability=trace,sync=debug,parachain=debug,cumulus-pov-recovery=debug,cumulus-consensus=debug", "--disable-block-announcements", "--bootnodes {{'bob'|zombie('multiAddress')}}", "--relay-chain-rpc-url {{'ferdie'|zombie('wsUri')}}", "--", "--reserved-only", "--reserved-nodes {{'ferdie'|zombie('multiAddress')}}"]