Add retry mechanism for pov-recovery, fix full-node pov-recovery (#2164)

* Increase delay for pov-recovery

* Update client/service/src/lib.rs

Co-authored-by: Bastian Köcher <git@kchr.de>

* Comment

* FMT

* Clear waiting_recovery when block is recovered or recovery failed

* Introduce recovery queue that preserves insertion order

* Better error logs

* Decrease slot duration

* Style improvements

* Add option to use unordered queue

* Maintain cache of finalized blocks

* Wait for one relay chain slot before recovery

* Make retries testable

* fmt

* Improve docs

* Improve docs

* Simplify RecoveryQueue

* Remove unwanted changes

* Adjust to comments

* Apply suggestions from code review

Co-authored-by: Bastian Köcher <git@kchr.de>

* Move recovery delay into the queue

* Check for finalized number

* Clean up

* Use timer

Co-authored-by: Bastian Köcher <git@kchr.de>

* Simplify implementation

* Revert "Use timer"

This reverts commit 3809eed840d3a09d54212f99486782ff80cdc1c9.

* Properly clear `to_recover` flag

---------

Co-authored-by: Bastian Köcher <git@kchr.de>
This commit is contained in:
Sebastian Kunert
2023-02-09 14:18:55 +01:00
committed by GitHub
parent b3d68426a2
commit 588bdad7f6
15 changed files with 419 additions and 161 deletions
@@ -18,12 +18,13 @@ use sp_runtime::traits::Block as BlockT;
use polkadot_node_primitives::AvailableData;
use polkadot_node_subsystem::messages::AvailabilityRecoveryMessage;
use polkadot_overseer::Handle as OverseerHandle;
use futures::{channel::oneshot, stream::FuturesUnordered, Future, FutureExt, StreamExt};
use std::{collections::HashSet, pin::Pin};
use crate::RecoveryHandle;
/// The active candidate recovery.
///
/// This handles the candidate recovery and tracks the active recoveries.
@@ -34,12 +35,12 @@ pub(crate) struct ActiveCandidateRecovery<Block: BlockT> {
>,
/// The block hashes of the candidates currently being recovered.
candidates: HashSet<Block::Hash>,
overseer_handle: OverseerHandle,
recovery_handle: Box<dyn RecoveryHandle>,
}
impl<Block: BlockT> ActiveCandidateRecovery<Block> {
pub fn new(overseer_handle: OverseerHandle) -> Self {
Self { recoveries: Default::default(), candidates: Default::default(), overseer_handle }
pub fn new(recovery_handle: Box<dyn RecoveryHandle>) -> Self {
Self { recoveries: Default::default(), candidates: Default::default(), recovery_handle }
}
/// Recover the given `candidate`.
@@ -50,8 +51,8 @@ impl<Block: BlockT> ActiveCandidateRecovery<Block> {
) {
let (tx, rx) = oneshot::channel();
self.overseer_handle
.send_msg(
self.recovery_handle
.send_recovery_msg(
AvailabilityRecoveryMessage::RecoverAvailableData(
candidate.receipt.clone(),
candidate.session_index,
@@ -90,11 +91,6 @@ impl<Block: BlockT> ActiveCandidateRecovery<Block> {
);
}
/// Returns if the given `candidate` is being recovered.
pub fn is_being_recovered(&self, candidate: &Block::Hash) -> bool {
self.candidates.contains(candidate)
}
/// Waits for the next recovery.
///
/// If the returned [`AvailableData`] is `None`, it means that the recovery failed.