feat: initialize Kurdistan SDK - independent fork of Polkadot SDK

2025-12-13 15:44:15 +03:00
commit e4778b4576
6838 changed files with 1847450 additions and 0 deletions
@@ -0,0 +1,925 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Pezkuwi.
+
+// Pezkuwi is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Pezkuwi is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Pezkuwi.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Availability Recovery Subsystem of Pezkuwi.
+
+#![warn(missing_docs)]
+
+use std::{
+	collections::{BTreeMap, VecDeque},
+	iter::Iterator,
+	num::NonZeroUsize,
+	pin::Pin,
+};
+
+use futures::{
+	channel::oneshot,
+	future::{Future, FutureExt, RemoteHandle},
+	pin_mut,
+	prelude::*,
+	sink::SinkExt,
+	stream::{FuturesUnordered, StreamExt},
+	task::{Context, Poll},
+};
+use sc_network::ProtocolName;
+use schnellru::{ByLength, LruMap};
+use task::{
+	FetchChunks, FetchChunksParams, FetchFull, FetchFullParams, FetchSystematicChunks,
+	FetchSystematicChunksParams,
+};
+
+use pezkuwi_erasure_coding::{
+	branches, obtain_chunks_v1, recovery_threshold, systematic_recovery_threshold,
+	Error as ErasureEncodingError,
+};
+use task::{RecoveryParams, RecoveryStrategy, RecoveryTask};
+
+use error::{log_error, Error, FatalError, Result};
+use pezkuwi_node_network_protocol::{
+	request_response::{
+		v1 as request_v1, v2 as request_v2, IncomingRequestReceiver, IsRequest, ReqProtocolNames,
+	},
+	UnifiedReputationChange as Rep,
+};
+use pezkuwi_node_primitives::AvailableData;
+use pezkuwi_node_subsystem::{
+	errors::RecoveryError,
+	messages::{AvailabilityRecoveryMessage, AvailabilityStoreMessage},
+	overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem,
+	SubsystemContext, SubsystemError,
+};
+use pezkuwi_node_subsystem_util::{
+	availability_chunks::availability_chunk_indices,
+	runtime::{ExtendedSessionInfo, RuntimeInfo},
+};
+use pezkuwi_primitives::{
+	node_features, BlockNumber, CandidateHash, CandidateReceiptV2 as CandidateReceipt, ChunkIndex,
+	CoreIndex, GroupIndex, Hash, SessionIndex, ValidatorIndex,
+};
+
+mod error;
+mod futures_undead;
+mod metrics;
+mod task;
+pub use metrics::Metrics;
+
+#[cfg(test)]
+mod tests;
+
+type RecoveryResult = std::result::Result<AvailableData, RecoveryError>;
+
+const LOG_TARGET: &str = "teyrchain::availability-recovery";
+
+// Size of the LRU cache where we keep recovered data.
+const LRU_SIZE: u32 = 16;
+
+const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
+
+/// PoV size limit in bytes for which prefer fetching from backers. (conservative, Pezkuwi for now)
+pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
+/// PoV size limit in bytes for which prefer fetching from backers. (Kusama and all testnets)
+pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
+
+#[derive(Clone, PartialEq)]
+/// The strategy we use to recover the PoV.
+pub enum RecoveryStrategyKind {
+	/// We try the backing group first if PoV size is lower than specified, then fallback to
+	/// validator chunks.
+	BackersFirstIfSizeLower(usize),
+	/// We try the backing group first if PoV size is lower than specified, then fallback to
+	/// systematic chunks. Regular chunk recovery as a last resort.
+	BackersFirstIfSizeLowerThenSystematicChunks(usize),
+
+	/// The following variants are only helpful for integration tests.
+	///
+	/// We always try the backing group first, then fallback to validator chunks.
+	#[allow(dead_code)]
+	BackersFirstAlways,
+	/// We always recover using validator chunks.
+	#[allow(dead_code)]
+	ChunksAlways,
+	/// First try the backing group. Then systematic chunks.
+	#[allow(dead_code)]
+	BackersThenSystematicChunks,
+	/// Always recover using systematic chunks, fall back to regular chunks.
+	#[allow(dead_code)]
+	SystematicChunks,
+}
+
+/// The Availability Recovery Subsystem.
+pub struct AvailabilityRecoverySubsystem {
+	/// PoV recovery strategy to use.
+	recovery_strategy_kind: RecoveryStrategyKind,
+	// If this is true, do not request data from the availability store.
+	/// This is the useful for nodes where the
+	/// availability-store subsystem is not expected to run,
+	/// such as collators.
+	bypass_availability_store: bool,
+	/// Receiver for available data requests.
+	req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
+	/// Metrics for this subsystem.
+	metrics: Metrics,
+	/// The type of check to perform after available data was recovered.
+	post_recovery_check: PostRecoveryCheck,
+	/// Full protocol name for ChunkFetchingV1.
+	req_v1_protocol_name: ProtocolName,
+	/// Full protocol name for ChunkFetchingV2.
+	req_v2_protocol_name: ProtocolName,
+}
+
+#[derive(Clone, PartialEq, Debug)]
+/// The type of check to perform after available data was recovered.
+enum PostRecoveryCheck {
+	/// Reencode the data and check erasure root. For validators.
+	Reencode,
+	/// Only check the pov hash. For collators only.
+	PovHash,
+}
+
+/// Expensive erasure coding computations that we want to run on a blocking thread.
+enum ErasureTask {
+	/// Reconstructs `AvailableData` from chunks given `n_validators`.
+	Reconstruct(
+		usize,
+		BTreeMap<ChunkIndex, Vec<u8>>,
+		oneshot::Sender<std::result::Result<AvailableData, ErasureEncodingError>>,
+	),
+	/// Re-encode `AvailableData` into erasure chunks in order to verify the provided root hash of
+	/// the Merkle tree.
+	Reencode(usize, Hash, AvailableData, oneshot::Sender<Option<AvailableData>>),
+}
+
+/// Re-encode the data into erasure chunks in order to verify
+/// the root hash of the provided Merkle tree, which is built
+/// on-top of the encoded chunks.
+///
+/// This (expensive) check is necessary, as otherwise we can't be sure that some chunks won't have
+/// been tampered with by the backers, which would result in some validators considering the data
+/// valid and some invalid as having fetched different set of chunks. The checking of the Merkle
+/// proof for individual chunks only gives us guarantees, that we have fetched a chunk belonging to
+/// a set the backers have committed to.
+///
+/// NOTE: It is fine to do this check with already decoded data, because if the decoding failed for
+/// some validators, we can be sure that chunks have been tampered with (by the backers) or the
+/// data was invalid to begin with. In the former case, validators fetching valid chunks will see
+/// invalid data as well, because the root won't match. In the latter case the situation is the
+/// same for anyone anyways.
+fn reconstructed_data_matches_root(
+	n_validators: usize,
+	expected_root: &Hash,
+	data: &AvailableData,
+	metrics: &Metrics,
+) -> bool {
+	let _timer = metrics.time_reencode_chunks();
+
+	let chunks = match obtain_chunks_v1(n_validators, data) {
+		Ok(chunks) => chunks,
+		Err(e) => {
+			gum::debug!(
+				target: LOG_TARGET,
+				err = ?e,
+				"Failed to obtain chunks",
+			);
+			return false;
+		},
+	};
+
+	let branches = branches(&chunks);
+
+	branches.root() == *expected_root
+}
+
+/// Accumulate all awaiting sides for some particular `AvailableData`.
+struct RecoveryHandle {
+	candidate_hash: CandidateHash,
+	remote: RemoteHandle<RecoveryResult>,
+	awaiting: Vec<oneshot::Sender<RecoveryResult>>,
+}
+
+impl Future for RecoveryHandle {
+	type Output = Option<(CandidateHash, RecoveryResult)>;
+
+	fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+		let mut indices_to_remove = Vec::new();
+		for (i, awaiting) in self.awaiting.iter_mut().enumerate().rev() {
+			if let Poll::Ready(()) = awaiting.poll_canceled(cx) {
+				indices_to_remove.push(i);
+			}
+		}
+
+		// these are reverse order, so remove is fine.
+		for index in indices_to_remove {
+			gum::debug!(
+				target: LOG_TARGET,
+				candidate_hash = ?self.candidate_hash,
+				"Receiver for available data dropped.",
+			);
+
+			self.awaiting.swap_remove(index);
+		}
+
+		if self.awaiting.is_empty() {
+			gum::debug!(
+				target: LOG_TARGET,
+				candidate_hash = ?self.candidate_hash,
+				"All receivers for available data dropped.",
+			);
+
+			return Poll::Ready(None);
+		}
+
+		let remote = &mut self.remote;
+		futures::pin_mut!(remote);
+		let result = futures::ready!(remote.poll(cx));
+
+		for awaiting in self.awaiting.drain(..) {
+			let _ = awaiting.send(result.clone());
+		}
+
+		Poll::Ready(Some((self.candidate_hash, result)))
+	}
+}
+
+/// Cached result of an availability recovery operation.
+#[derive(Debug, Clone)]
+enum CachedRecovery {
+	/// Availability was successfully retrieved before.
+	Valid(AvailableData),
+	/// Availability was successfully retrieved before, but was found to be invalid.
+	Invalid,
+}
+
+impl CachedRecovery {
+	/// Convert back to	`Result` to deliver responses.
+	fn into_result(self) -> RecoveryResult {
+		match self {
+			Self::Valid(d) => Ok(d),
+			Self::Invalid => Err(RecoveryError::Invalid),
+		}
+	}
+}
+
+impl TryFrom<RecoveryResult> for CachedRecovery {
+	type Error = ();
+	fn try_from(o: RecoveryResult) -> std::result::Result<CachedRecovery, Self::Error> {
+		match o {
+			Ok(d) => Ok(Self::Valid(d)),
+			Err(RecoveryError::Invalid) => Ok(Self::Invalid),
+			// We don't want to cache unavailable state, as that state might change, so if
+			// requested again we want to try again!
+			Err(RecoveryError::Unavailable) => Err(()),
+			Err(RecoveryError::ChannelClosed) => Err(()),
+		}
+	}
+}
+
+struct State {
+	/// Each recovery task is implemented as its own async task,
+	/// and these handles are for communicating with them.
+	ongoing_recoveries: FuturesUnordered<RecoveryHandle>,
+
+	/// A recent block hash for which state should be available.
+	live_block: (BlockNumber, Hash),
+
+	/// An LRU cache of recently recovered data.
+	availability_lru: LruMap<CandidateHash, CachedRecovery>,
+
+	/// Cached runtime info.
+	runtime_info: RuntimeInfo,
+}
+
+impl Default for State {
+	fn default() -> Self {
+		Self {
+			ongoing_recoveries: FuturesUnordered::new(),
+			live_block: (0, Hash::default()),
+			availability_lru: LruMap::new(ByLength::new(LRU_SIZE)),
+			runtime_info: RuntimeInfo::new(None),
+		}
+	}
+}
+
+#[overseer::subsystem(AvailabilityRecovery, error=SubsystemError, prefix=self::overseer)]
+impl<Context> AvailabilityRecoverySubsystem {
+	fn start(self, ctx: Context) -> SpawnedSubsystem {
+		let future = self
+			.run(ctx)
+			.map_err(|e| SubsystemError::with_origin("availability-recovery", e))
+			.boxed();
+		SpawnedSubsystem { name: "availability-recovery-subsystem", future }
+	}
+}
+
+/// Handles a signal from the overseer.
+/// Returns true if subsystem receives a deadly signal.
+async fn handle_signal(state: &mut State, signal: OverseerSignal) -> bool {
+	match signal {
+		OverseerSignal::Conclude => true,
+		OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, .. }) => {
+			// if activated is non-empty, set state.live_block to the highest block in `activated`
+			if let Some(activated) = activated {
+				if activated.number > state.live_block.0 {
+					state.live_block = (activated.number, activated.hash)
+				}
+			}
+
+			false
+		},
+		OverseerSignal::BlockFinalized(_, _) => false,
+	}
+}
+
+/// Machinery around launching recovery tasks into the background.
+#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+async fn launch_recovery_task<Context>(
+	state: &mut State,
+	ctx: &mut Context,
+	response_sender: oneshot::Sender<RecoveryResult>,
+	recovery_strategies: VecDeque<Box<dyn RecoveryStrategy<<Context as SubsystemContext>::Sender>>>,
+	params: RecoveryParams,
+) -> Result<()> {
+	let candidate_hash = params.candidate_hash;
+	let recovery_task = RecoveryTask::new(ctx.sender().clone(), params, recovery_strategies);
+
+	let (remote, remote_handle) = recovery_task.run().remote_handle();
+
+	state.ongoing_recoveries.push(RecoveryHandle {
+		candidate_hash,
+		remote: remote_handle,
+		awaiting: vec![response_sender],
+	});
+
+	ctx.spawn("recovery-task", Box::pin(remote))
+		.map_err(|err| Error::SpawnTask(err))
+}
+
+/// Handles an availability recovery request.
+#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+async fn handle_recover<Context>(
+	state: &mut State,
+	ctx: &mut Context,
+	receipt: CandidateReceipt,
+	session_index: SessionIndex,
+	backing_group: Option<GroupIndex>,
+	response_sender: oneshot::Sender<RecoveryResult>,
+	metrics: &Metrics,
+	erasure_task_tx: futures::channel::mpsc::Sender<ErasureTask>,
+	recovery_strategy_kind: RecoveryStrategyKind,
+	bypass_availability_store: bool,
+	post_recovery_check: PostRecoveryCheck,
+	maybe_core_index: Option<CoreIndex>,
+	req_v1_protocol_name: ProtocolName,
+	req_v2_protocol_name: ProtocolName,
+) -> Result<()> {
+	let candidate_hash = receipt.hash();
+
+	if let Some(result) =
+		state.availability_lru.get(&candidate_hash).cloned().map(|v| v.into_result())
+	{
+		return response_sender.send(result).map_err(|_| Error::CanceledResponseSender);
+	}
+
+	if let Some(i) =
+		state.ongoing_recoveries.iter_mut().find(|i| i.candidate_hash == candidate_hash)
+	{
+		i.awaiting.push(response_sender);
+		return Ok(());
+	}
+
+	let session_info_res = state
+		.runtime_info
+		.get_session_info_by_index(ctx.sender(), state.live_block.1, session_index)
+		.await;
+
+	match session_info_res {
+		Ok(ExtendedSessionInfo { session_info, node_features, .. }) => {
+			let mut backer_group = None;
+			let n_validators = session_info.validators.len();
+			let systematic_threshold = systematic_recovery_threshold(n_validators)?;
+			let mut recovery_strategies: VecDeque<
+				Box<dyn RecoveryStrategy<<Context as SubsystemContext>::Sender>>,
+			> = VecDeque::with_capacity(3);
+
+			if let Some(backing_group) = backing_group {
+				if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
+					let mut small_pov_size = true;
+
+					match recovery_strategy_kind {
+						RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) |
+						RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(
+							fetch_chunks_threshold,
+						) => {
+							// Get our own chunk size to get an estimate of the PoV size.
+							let chunk_size: Result<Option<usize>> =
+								query_chunk_size(ctx, candidate_hash).await;
+							if let Ok(Some(chunk_size)) = chunk_size {
+								let pov_size_estimate = chunk_size * systematic_threshold;
+								small_pov_size = pov_size_estimate < fetch_chunks_threshold;
+
+								if small_pov_size {
+									gum::trace!(
+										target: LOG_TARGET,
+										?candidate_hash,
+										pov_size_estimate,
+										fetch_chunks_threshold,
+										"Prefer fetch from backing group",
+									);
+								}
+							} else {
+								// we have a POV limit but were not able to query the chunk size, so
+								// don't use the backing group.
+								small_pov_size = false;
+							}
+						},
+						_ => {},
+					};
+
+					match (&recovery_strategy_kind, small_pov_size) {
+						(RecoveryStrategyKind::BackersFirstAlways, _) |
+						(RecoveryStrategyKind::BackersFirstIfSizeLower(_), true) |
+						(
+							RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(_),
+							true,
+						) |
+						(RecoveryStrategyKind::BackersThenSystematicChunks, _) =>
+							recovery_strategies.push_back(Box::new(FetchFull::new(
+								FetchFullParams { validators: backing_validators.to_vec() },
+							))),
+						_ => {},
+					};
+
+					backer_group = Some(backing_validators);
+				}
+			}
+
+			let chunk_mapping_enabled = if let Some(&true) = node_features
+				.get(usize::from(node_features::FeatureIndex::AvailabilityChunkMapping as u8))
+				.as_deref()
+			{
+				true
+			} else {
+				false
+			};
+
+			// We can only attempt systematic recovery if we received the core index of the
+			// candidate and chunk mapping is enabled.
+			if let Some(core_index) = maybe_core_index {
+				if matches!(
+					recovery_strategy_kind,
+					RecoveryStrategyKind::BackersThenSystematicChunks |
+						RecoveryStrategyKind::SystematicChunks |
+						RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(_)
+				) && chunk_mapping_enabled
+				{
+					let chunk_indices =
+						availability_chunk_indices(node_features, n_validators, core_index)?;
+
+					let chunk_indices: VecDeque<_> = chunk_indices
+						.iter()
+						.enumerate()
+						.map(|(v_index, c_index)| {
+							(
+								*c_index,
+								ValidatorIndex(
+									u32::try_from(v_index)
+										.expect("validator count should not exceed u32"),
+								),
+							)
+						})
+						.collect();
+
+					// Only get the validators according to the threshold.
+					let validators = chunk_indices
+						.clone()
+						.into_iter()
+						.filter(|(c_index, _)| {
+							usize::try_from(c_index.0)
+								.expect("usize is at least u32 bytes on all modern targets.") <
+								systematic_threshold
+						})
+						.collect();
+
+					recovery_strategies.push_back(Box::new(FetchSystematicChunks::new(
+						FetchSystematicChunksParams {
+							validators,
+							backers: backer_group.map(|v| v.to_vec()).unwrap_or_else(|| vec![]),
+						},
+					)));
+				}
+			}
+
+			recovery_strategies.push_back(Box::new(FetchChunks::new(FetchChunksParams {
+				n_validators: session_info.validators.len(),
+			})));
+
+			let session_info = session_info.clone();
+
+			let n_validators = session_info.validators.len();
+
+			launch_recovery_task(
+				state,
+				ctx,
+				response_sender,
+				recovery_strategies,
+				RecoveryParams {
+					validator_authority_keys: session_info.discovery_keys.clone(),
+					n_validators,
+					threshold: recovery_threshold(n_validators)?,
+					systematic_threshold,
+					candidate_hash,
+					erasure_root: receipt.descriptor.erasure_root(),
+					metrics: metrics.clone(),
+					bypass_availability_store,
+					post_recovery_check,
+					pov_hash: receipt.descriptor.pov_hash(),
+					req_v1_protocol_name,
+					req_v2_protocol_name,
+					chunk_mapping_enabled,
+					erasure_task_tx,
+				},
+			)
+			.await
+		},
+		Err(_) => {
+			response_sender
+				.send(Err(RecoveryError::Unavailable))
+				.map_err(|_| Error::CanceledResponseSender)?;
+
+			Err(Error::SessionInfoUnavailable(state.live_block.1))
+		},
+	}
+}
+
+/// Queries the full `AvailableData` from av-store.
+#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+async fn query_full_data<Context>(
+	ctx: &mut Context,
+	candidate_hash: CandidateHash,
+) -> Result<Option<AvailableData>> {
+	let (tx, rx) = oneshot::channel();
+	ctx.send_message(AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx))
+		.await;
+
+	rx.await.map_err(Error::CanceledQueryFullData)
+}
+
+/// Queries a chunk from av-store.
+#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+async fn query_chunk_size<Context>(
+	ctx: &mut Context,
+	candidate_hash: CandidateHash,
+) -> Result<Option<usize>> {
+	let (tx, rx) = oneshot::channel();
+	ctx.send_message(AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx))
+		.await;
+
+	rx.await.map_err(Error::CanceledQueryFullData)
+}
+
+#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+impl AvailabilityRecoverySubsystem {
+	/// Create a new instance of `AvailabilityRecoverySubsystem` suitable for collator nodes,
+	/// which never requests the `AvailabilityStoreSubsystem` subsystem and only checks the POV hash
+	/// instead of reencoding the available data.
+	pub fn for_collator(
+		fetch_chunks_threshold: Option<usize>,
+		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
+		req_protocol_names: &ReqProtocolNames,
+		metrics: Metrics,
+	) -> Self {
+		Self {
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
+			bypass_availability_store: true,
+			post_recovery_check: PostRecoveryCheck::PovHash,
+			req_receiver,
+			metrics,
+			req_v1_protocol_name: req_protocol_names
+				.get_name(request_v1::ChunkFetchingRequest::PROTOCOL),
+			req_v2_protocol_name: req_protocol_names
+				.get_name(request_v2::ChunkFetchingRequest::PROTOCOL),
+		}
+	}
+
+	/// Create an optimised new instance of `AvailabilityRecoverySubsystem` suitable for validator
+	/// nodes, which:
+	/// - for small POVs (over the `fetch_chunks_threshold` or the
+	///   `CONSERVATIVE_FETCH_CHUNKS_THRESHOLD`), it attempts full recovery from backers, if backing
+	///   group supplied.
+	/// - for large POVs, attempts systematic recovery, if core_index supplied and
+	///   AvailabilityChunkMapping node feature is enabled.
+	/// - as a last resort, attempt regular chunk recovery from all validators.
+	pub fn for_validator(
+		fetch_chunks_threshold: Option<usize>,
+		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
+		req_protocol_names: &ReqProtocolNames,
+		metrics: Metrics,
+	) -> Self {
+		Self {
+			recovery_strategy_kind:
+				RecoveryStrategyKind::BackersFirstIfSizeLowerThenSystematicChunks(
+					fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+				),
+			bypass_availability_store: false,
+			post_recovery_check: PostRecoveryCheck::Reencode,
+			req_receiver,
+			metrics,
+			req_v1_protocol_name: req_protocol_names
+				.get_name(request_v1::ChunkFetchingRequest::PROTOCOL),
+			req_v2_protocol_name: req_protocol_names
+				.get_name(request_v2::ChunkFetchingRequest::PROTOCOL),
+		}
+	}
+
+	/// Customise the recovery strategy kind
+	/// Currently only useful for tests.
+	#[cfg(any(test, feature = "subsystem-benchmarks"))]
+	pub fn with_recovery_strategy_kind(
+		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
+		req_protocol_names: &ReqProtocolNames,
+		metrics: Metrics,
+		recovery_strategy_kind: RecoveryStrategyKind,
+	) -> Self {
+		Self {
+			recovery_strategy_kind,
+			bypass_availability_store: false,
+			post_recovery_check: PostRecoveryCheck::Reencode,
+			req_receiver,
+			metrics,
+			req_v1_protocol_name: req_protocol_names
+				.get_name(request_v1::ChunkFetchingRequest::PROTOCOL),
+			req_v2_protocol_name: req_protocol_names
+				.get_name(request_v2::ChunkFetchingRequest::PROTOCOL),
+		}
+	}
+
+	/// Starts the inner subsystem loop.
+	pub async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
+		let mut state = State::default();
+		let Self {
+			mut req_receiver,
+			metrics,
+			recovery_strategy_kind,
+			bypass_availability_store,
+			post_recovery_check,
+			req_v1_protocol_name,
+			req_v2_protocol_name,
+		} = self;
+
+		let (erasure_task_tx, erasure_task_rx) = futures::channel::mpsc::channel(16);
+		let mut erasure_task_rx = erasure_task_rx.fuse();
+
+		// `ThreadPoolBuilder` spawns the tasks using `spawn_blocking`. For each worker there will
+		// be a `mpsc` channel created. Each of these workers take the `Receiver` and poll it in an
+		// infinite loop. All of the sender ends of the channel are sent as a vec which we then use
+		// to create a `Cycle` iterator. We use this iterator to assign work in a round-robin
+		// fashion to the workers in the pool.
+		//
+		// How work is dispatched to the pool from the recovery tasks:
+		// - Once a recovery task finishes retrieving the availability data, it needs to reconstruct
+		//   from chunks and/or
+		// re-encode the data which are heavy CPU computations.
+		// To do so it sends an `ErasureTask` to the main loop via the `erasure_task` channel, and
+		// waits for the results over a `oneshot` channel.
+		// - In the subsystem main loop we poll the `erasure_task_rx` receiver.
+		// - We forward the received `ErasureTask` to the `next()` sender yielded by the `Cycle`
+		//   iterator.
+		// - Some worker thread handles it and sends the response over the `oneshot` channel.
+
+		// Create a thread pool with 2 workers.
+		let mut to_pool = ThreadPoolBuilder::build(
+			// Pool is guaranteed to have at least 1 worker thread.
+			NonZeroUsize::new(2).expect("There are 2 threads; qed"),
+			metrics.clone(),
+			&mut ctx,
+		)
+		.into_iter()
+		.cycle();
+
+		loop {
+			let recv_req = req_receiver.recv(|| vec![COST_INVALID_REQUEST]).fuse();
+			pin_mut!(recv_req);
+			let res = futures::select! {
+				erasure_task = erasure_task_rx.next() => {
+					match erasure_task {
+						Some(task) => {
+							to_pool
+								.next()
+								.expect("Pool size is `NonZeroUsize`; qed")
+								.send(task)
+								.await
+								.map_err(|_| RecoveryError::ChannelClosed)
+						},
+						None => {
+							Err(RecoveryError::ChannelClosed)
+						}
+					}.map_err(Into::into)
+				}
+				signal = ctx.recv().fuse() => {
+					match signal {
+						Ok(signal) => {
+							match signal {
+								FromOrchestra::Signal(signal) => if handle_signal(
+									&mut state,
+									signal,
+								).await {
+									gum::debug!(target: LOG_TARGET, "subsystem concluded");
+									return Ok(());
+								} else {
+									Ok(())
+								},
+								FromOrchestra::Communication {
+									msg: AvailabilityRecoveryMessage::RecoverAvailableData(
+										receipt,
+										session_index,
+										maybe_backing_group,
+										maybe_core_index,
+										response_sender,
+									)
+								} => handle_recover(
+										&mut state,
+										&mut ctx,
+										receipt,
+										session_index,
+										maybe_backing_group,
+										response_sender,
+										&metrics,
+										erasure_task_tx.clone(),
+										recovery_strategy_kind.clone(),
+										bypass_availability_store,
+										post_recovery_check.clone(),
+										maybe_core_index,
+										req_v1_protocol_name.clone(),
+										req_v2_protocol_name.clone(),
+									).await
+							}
+						},
+						Err(e) => Err(Error::SubsystemReceive(e))
+					}
+				}
+				in_req = recv_req => {
+					match in_req {
+						Ok(req) => {
+							if bypass_availability_store {
+								gum::debug!(
+									target: LOG_TARGET,
+									"Skipping request to availability-store.",
+								);
+								let _ = req.send_response(None.into());
+								Ok(())
+							} else {
+								match query_full_data(&mut ctx, req.payload.candidate_hash).await {
+									Ok(res) => {
+										let _ = req.send_response(res.into());
+										Ok(())
+									}
+									Err(e) => {
+										let _ = req.send_response(None.into());
+										Err(e)
+									}
+								}
+							}
+						}
+						Err(e) => Err(Error::IncomingRequest(e))
+					}
+				}
+				output = state.ongoing_recoveries.select_next_some() => {
+					let mut res = Ok(());
+					if let Some((candidate_hash, result)) = output {
+						if let Err(ref e) = result {
+							res = Err(Error::Recovery(e.clone()));
+						}
+
+						if let Ok(recovery) = CachedRecovery::try_from(result) {
+							state.availability_lru.insert(candidate_hash, recovery);
+						}
+					}
+
+					res
+				}
+			};
+
+			// Only bubble up fatal errors, but log all of them.
+			if let Err(e) = res {
+				log_error(Err(e))?;
+			}
+		}
+	}
+}
+
+// A simple thread pool implementation using `spawn_blocking` threads.
+struct ThreadPoolBuilder;
+
+const MAX_THREADS: NonZeroUsize = match NonZeroUsize::new(4) {
+	Some(max_threads) => max_threads,
+	None => panic!("MAX_THREADS must be non-zero"),
+};
+
+impl ThreadPoolBuilder {
+	// Creates a pool of `size` workers, where 1 <= `size` <= `MAX_THREADS`.
+	//
+	// Each worker is created by `spawn_blocking` and takes the receiver side of a channel
+	// while all of the senders are returned to the caller. Each worker runs `erasure_task_thread`
+	// that polls the `Receiver` for an `ErasureTask` which is expected to be CPU intensive. The
+	// larger the input (more or larger chunks/availability data), the more CPU cycles will be
+	// spent.
+	//
+	// For example, for 32KB PoVs, we'd expect re-encode to eat as much as 90ms and 500ms for
+	// 2.5MiB.
+	//
+	// After executing such a task, the worker sends the response via a provided `oneshot` sender.
+	//
+	// The caller is responsible for routing work to the workers.
+	#[overseer::contextbounds(AvailabilityRecovery, prefix = self::overseer)]
+	pub fn build<Context>(
+		size: NonZeroUsize,
+		metrics: Metrics,
+		ctx: &mut Context,
+	) -> Vec<futures::channel::mpsc::Sender<ErasureTask>> {
+		// At least 1 task, at most `MAX_THREADS.
+		let size = std::cmp::min(size, MAX_THREADS);
+		let mut senders = Vec::new();
+
+		for index in 0..size.into() {
+			let (tx, rx) = futures::channel::mpsc::channel(8);
+			senders.push(tx);
+
+			if let Err(e) = ctx
+				.spawn_blocking("erasure-task", Box::pin(erasure_task_thread(metrics.clone(), rx)))
+			{
+				gum::warn!(
+					target: LOG_TARGET,
+					err = ?e,
+					index,
+					"Failed to spawn a erasure task",
+				);
+			}
+		}
+		senders
+	}
+}
+
+// Handles CPU intensive operation on a dedicated blocking thread.
+async fn erasure_task_thread(
+	metrics: Metrics,
+	mut ingress: futures::channel::mpsc::Receiver<ErasureTask>,
+) {
+	loop {
+		match ingress.next().await {
+			Some(ErasureTask::Reconstruct(n_validators, chunks, sender)) => {
+				let _ = sender.send(pezkuwi_erasure_coding::reconstruct_v1(
+					n_validators,
+					chunks.iter().map(|(c_index, chunk)| {
+						(
+							&chunk[..],
+							usize::try_from(c_index.0)
+								.expect("usize is at least u32 bytes on all modern targets."),
+						)
+					}),
+				));
+			},
+			Some(ErasureTask::Reencode(n_validators, root, available_data, sender)) => {
+				let metrics = metrics.clone();
+
+				let maybe_data = if reconstructed_data_matches_root(
+					n_validators,
+					&root,
+					&available_data,
+					&metrics,
+				) {
+					Some(available_data)
+				} else {
+					None
+				};
+
+				let _ = sender.send(maybe_data);
+			},
+			None => {
+				gum::trace!(
+					target: LOG_TARGET,
+					"Erasure task channel closed. Node shutting down ?",
+				);
+				break;
+			},
+		}
+
+		// In benchmarks this is a very hot loop not yielding at all.
+		// To update CPU metrics for the task we need to yield.
+		#[cfg(feature = "subsystem-benchmarks")]
+		tokio::task::yield_now().await;
+	}
+}