Add retry mechanism for pov-recovery, fix full-node pov-recovery (#2164)

* Increase delay for pov-recovery

* Update client/service/src/lib.rs

Co-authored-by: Bastian Köcher <git@kchr.de>

* Comment

* FMT

* Clear waiting_recovery when block is recovered or recovery failed

* Introduce recovery queue that preserves insertion order

* Better error logs

* Decrease slot duration

* Style improvements

* Add option to use unordered queue

* Maintain cache of finalized blocks

* Wait for one relay chain slot before recovery

* Make retries testable

* fmt

* Improve docs

* Improve docs

* Simplify RecoveryQueue

* Remove unwanted changes

* Adjust to comments

* Apply suggestions from code review

Co-authored-by: Bastian Köcher <git@kchr.de>

* Move recovery delay into the queue

* Check for finalized number

* Clean up

* Use timer

Co-authored-by: Bastian Köcher <git@kchr.de>

* Simplify implementation

* Revert "Use timer"

This reverts commit 3809eed840d3a09d54212f99486782ff80cdc1c9.

* Properly clear `to_recover` flag

---------

Co-authored-by: Bastian Köcher <git@kchr.de>
This commit is contained in:
Sebastian Kunert
2023-02-09 14:18:55 +01:00
committed by GitHub
parent b4d0992ca8
commit afcfd2404a
15 changed files with 419 additions and 161 deletions
+3
View File
@@ -49,6 +49,9 @@ pub struct TestCollatorCli {
#[arg(long)]
pub disable_block_announcements: bool,
#[arg(long)]
pub fail_pov_recovery: bool,
}
#[derive(Debug, clap::Subcommand)]
+56 -4
View File
@@ -34,6 +34,7 @@ use cumulus_client_consensus_common::{
ParachainBlockImport as TParachainBlockImport, ParachainCandidate, ParachainConsensus,
};
use cumulus_client_network::BlockAnnounceValidator;
use cumulus_client_pov_recovery::RecoveryHandle;
use cumulus_client_service::{
prepare_node_config, start_collator, start_full_node, StartCollatorParams, StartFullNodeParams,
};
@@ -45,6 +46,8 @@ use cumulus_relay_chain_minimal_node::build_minimal_relay_chain_node;
use cumulus_test_runtime::{Hash, Header, NodeBlock as Block, RuntimeApi};
use frame_system_rpc_runtime_api::AccountNonceApi;
use polkadot_node_subsystem::{errors::RecoveryError, messages::AvailabilityRecoveryMessage};
use polkadot_overseer::Handle as OverseerHandle;
use polkadot_primitives::{CollatorPair, Hash as PHash, PersistedValidationData};
use polkadot_service::ProvideRuntimeApi;
use sc_client_api::execution_extensions::ExecutionStrategies;
@@ -76,6 +79,8 @@ pub use cumulus_test_runtime as runtime;
pub use genesis::*;
pub use sp_keyring::Sr25519Keyring as Keyring;
const LOG_TARGET: &str = "cumulus-test-service";
/// A consensus that will never produce any block.
#[derive(Clone)]
struct NullConsensus;
@@ -126,6 +131,41 @@ pub type ParachainBlockImport = TParachainBlockImport<Block, Arc<Client>, Backen
/// Transaction pool type used by the test service
pub type TransactionPool = Arc<sc_transaction_pool::FullPool<Block, Client>>;
/// Recovery handle that fails regularly to simulate unavailable PoVs.
///
/// Wraps an [`OverseerHandle`]. The [`RecoveryHandle`] implementation of this type answers a
/// recovery request with [`RecoveryError::Unavailable`] whenever `counter % 5 == 0` and
/// forwards every other request to the wrapped overseer handle.
pub struct FailingRecoveryHandle {
/// Handle used to forward the recovery requests that are not failed on purpose.
overseer_handle: OverseerHandle,
/// Number of recovery requests handled so far; selects which requests are failed.
counter: u32,
}
impl FailingRecoveryHandle {
/// Create a new FailingRecoveryHandle
pub fn new(overseer_handle: OverseerHandle) -> Self {
Self { overseer_handle, counter: 0 }
}
}
#[async_trait::async_trait]
impl RecoveryHandle for FailingRecoveryHandle {
    /// Handle one availability-recovery request.
    ///
    /// Requests for which the internal counter is a multiple of five (including the very
    /// first one) are answered directly with [`RecoveryError::Unavailable`] so that the
    /// caller has to retry; all other requests are passed on to the overseer unchanged.
    /// The counter is advanced once per handled request.
    ///
    /// # Panics
    ///
    /// Panics if the response channel of a deliberately failed request can no longer be
    /// written to.
    async fn send_recovery_msg(
        &mut self,
        message: AvailabilityRecoveryMessage,
        origin: &'static str,
    ) {
        // Decide up front whether this request is one of the simulated failures.
        let simulate_unavailable = self.counter % 5 == 0;

        if !simulate_unavailable {
            self.overseer_handle.send_msg(message, origin).await;
        } else {
            // Short-circuit: report the data as unavailable without involving the overseer.
            let AvailabilityRecoveryMessage::RecoverAvailableData(_, _, _, response_tx) = message;
            tracing::info!(target: LOG_TARGET, "Failing pov recovery.");
            let unavailable = Err(RecoveryError::Unavailable);
            response_tx.send(unavailable).expect("Return channel should work here.");
        }

        // Counted only after the request was fully handled.
        self.counter += 1;
    }
}
/// Starts a `ServiceBuilder` for a full service.
///
/// Use this macro if you don't actually need the full service, but just the builder in order to
@@ -236,6 +276,7 @@ pub async fn start_node_impl<RB>(
relay_chain_config: Configuration,
para_id: ParaId,
wrap_announce_block: Option<Box<dyn FnOnce(AnnounceBlockFn) -> AnnounceBlockFn>>,
fail_pov_recovery: bool,
rpc_ext_builder: RB,
consensus: Consensus,
collator_options: CollatorOptions,
@@ -320,6 +361,17 @@ where
.unwrap_or_else(|| announce_block);
let relay_chain_interface_for_closure = relay_chain_interface.clone();
let overseer_handle = relay_chain_interface
.overseer_handle()
.map_err(|e| sc_service::Error::Application(Box::new(e)))?;
let recovery_handle: Box<dyn RecoveryHandle> = if fail_pov_recovery {
Box::new(FailingRecoveryHandle::new(overseer_handle))
} else {
Box::new(overseer_handle)
};
if let Some(collator_key) = collator_key {
let parachain_consensus: Box<dyn ParachainConsensus<Block>> = match consensus {
Consensus::RelayChain => {
@@ -374,6 +426,7 @@ where
collator_key,
import_queue: import_queue_service,
relay_chain_slot_duration: Duration::from_secs(6),
recovery_handle,
};
start_collator(params).await?;
@@ -385,10 +438,8 @@ where
para_id,
relay_chain_interface,
import_queue: import_queue_service,
// The slot duration is currently used internally only to configure
// the recovery delay of pov-recovery. We don't want to wait for too
// long on the full node to recover, so we reduce this time here.
relay_chain_slot_duration: Duration::from_millis(6),
relay_chain_slot_duration: Duration::from_secs(6),
recovery_handle,
};
start_full_node(params)?;
@@ -600,6 +651,7 @@ impl TestNodeBuilder {
relay_chain_config,
self.para_id,
self.wrap_announce_block,
false,
|_| Ok(jsonrpsee::RpcModule::new(())),
self.consensus,
collator_options,
+4
View File
@@ -123,6 +123,9 @@ fn main() -> Result<(), sc_cli::Error> {
"Is collating: {}",
if config.role.is_authority() { "yes" } else { "no" }
);
if cli.fail_pov_recovery {
tracing::info!("PoV recovery failure enabled");
}
let collator_key = config.role.is_authority().then(|| CollatorPair::generate().0);
@@ -141,6 +144,7 @@ fn main() -> Result<(), sc_cli::Error> {
polkadot_config,
parachain_id,
cli.disable_block_announcements.then(wrap_announce_block),
cli.fail_pov_recovery,
|_| Ok(jsonrpsee::RpcModule::new(())),
consensus,
collator_options,