Request based availability distribution (#2423)

* WIP

* availability distribution, still very wip.

Work on the requesting side of things.

* Some docs on what I intend to do.

* Checkpoint of session cache implementation

as I will likely replace it with something smarter.

* More work, mostly on cache

and getting things to type check.

* Only derive MallocSizeOf and Debug for std.

* availability-distribution: Cache feature complete.

* Sketch out logic in `FetchTask` for actual fetching.

- Compile fixes.
- Cleanup.

* Format cleanup.

* More format fixes.

* Almost feature complete `fetch_task`.

Missing:

- Check for cancel
- Actual querying of peer ids.

* Finish FetchTask so far.

* Directly use AuthorityDiscoveryId in protocol and cache.

* Resolve `AuthorityDiscoveryId` on sending requests.

* Rework fetch_task

- also make it impossible to check the wrong chunk index.
- Export needed function in validator_discovery.

* From<u32> implementation for `ValidatorIndex`.

* Fixes and more integration work.

* Make session cache proper lru cache.

* Use proper lru cache.

* Requester finished.

* ProtocolState -> Requester

Also make sure to not fetch our own chunk.

* Cleanup + fixes.

* Remove unused functions

- FetchTask::is_finished
- SessionCache::fetch_session_info

* availability-distribution responding side.

* Cleanup + Fixes.

* More fixes.

* More fixes.

adder-collator is running!

* Some docs.

* Docs.

* Fix reporting of bad guys.

* Fix tests

* Make all tests compile.

* Fix test.

* Cleanup + get rid of some warnings.

* state -> requester

* Mostly doc fixes.

* Fix test suite.

* Get rid of now redundant message types.

* WIP

* Rob's review remarks.

* Fix test suite.

* core.relay_parent -> leaf for session request.

* Style fix.

* Decrease request timeout.

* Cleanup obsolete errors.

* Metrics + don't fail on non fatal errors.

* requester.rs -> requester/mod.rs

* Panic on invalid BadValidator report.

* Fix indentation.

* Use typed default timeout constant.

* Make channel size 0, as each sender gets one slot anyways.

* Fix incorrect metrics initialization.

* Fix build after merge.

* More fixes.

* Hopefully valid metrics names.

* Better metrics names.

* Some tests that already work.

* Slightly better docs.

* Some more tests.

* Fix network bridge test.
Robert Klotzner
2021-02-26 18:58:07 +01:00
committed by GitHub
parent 241b1f12a7
commit 48409e5548
45 changed files with 2037 additions and 1523 deletions
@@ -0,0 +1,89 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//
//! Error handling related code and Error/Result definitions.
use thiserror::Error;
use futures::channel::oneshot;
use polkadot_node_subsystem_util::Error as UtilError;
use polkadot_primitives::v1::SessionIndex;
use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError};
/// Errors of this subsystem.
#[derive(Debug, Error)]
pub enum Error {
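/// Response channel for obtaining a queried chunk from the availability store got canceled.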
#[error("Response channel to obtain QueryChunk failed")]
QueryChunkResponseChannel(#[source] oneshot::Canceled),
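/// The channel for receiving incoming subsystem messages got closed.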
#[error("Receive channel closed")]
IncomingMessageChannel(#[source] SubsystemError),
/// Some request to utility functions failed.
#[error("Runtime request failed")]
UtilRequest(#[source] UtilError),
/// Some request to the runtime failed.
#[error("Runtime request failed")]
RuntimeRequestCanceled(#[source] oneshot::Canceled),
/// Some request to the runtime failed.
#[error("Runtime request failed")]
RuntimeRequest(#[source] RuntimeApiError),
/// We tried fetching a session which was not available.
#[error("No such session")]
NoSuchSession(SessionIndex),
/// Spawning a running task failed.
#[error("Spawning subsystem task failed")]
SpawnTask(#[source] SubsystemError),
/// We tried accessing a session that was not cached.
#[error("Session is not cached.")]
NoSuchCachedSession,
/// Requester stream exhausted.
#[error("Erasure chunk requester stream exhausted")]
RequesterExhausted,
/// Sending response failed.
#[error("Sending a request's response failed.")]
SendResponse,
}
pub type Result<T> = std::result::Result<T, Error>;
impl From<SubsystemError> for Error {
fn from(err: SubsystemError) -> Self {
Self::IncomingMessageChannel(err)
}
}
/// Receive a response from a runtime request and convert errors.
pub(crate) async fn recv_runtime<V>(
r: std::result::Result<
oneshot::Receiver<std::result::Result<V, RuntimeApiError>>,
UtilError,
>,
) -> Result<V> {
r.map_err(Error::UtilRequest)?
.await
.map_err(Error::RuntimeRequestCanceled)?
.map_err(Error::RuntimeRequest)
}
File diff suppressed because it is too large.
@@ -0,0 +1,117 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use polkadot_node_subsystem_util::metrics::prometheus::{Counter, U64, Registry, PrometheusError, CounterVec, Opts};
use polkadot_node_subsystem_util::metrics::prometheus;
use polkadot_node_subsystem_util::metrics;
/// Label for success counters.
pub const SUCCEEDED: &'static str = "succeeded";
/// Label for fail counters.
pub const FAILED: &'static str = "failed";
/// Label for chunks that could not be served, because they were not available.
pub const NOT_FOUND: &'static str = "not-found";
/// Availability Distribution metrics.
#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);
#[derive(Clone)]
struct MetricsInner {
/// Number of chunks fetched.
///
/// Note: The failed count gets incremented when we were not able to fetch the chunk at all.
/// For cases where we failed downloading but succeeded on the next try (with different
/// backers), see `retries`.
fetched_chunks: CounterVec<U64>,
/// Number of chunks served.
///
/// Note: Right now, `succeeded` gets incremented whenever we were able to respond to a chunk
/// request with the requested chunk; requests for chunks we do not have are counted under
/// `not-found`.
served_chunks: CounterVec<U64>,
/// Number of times our first set of validators did not provide the needed chunk and we had to
/// query further validators.
retries: Counter<U64>,
}
impl Metrics {
/// Create new dummy metrics, not reporting anything.
pub fn new_dummy() -> Self {
Metrics(None)
}
/// Increment counter on fetched labels.
pub fn on_fetch(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.fetched_chunks.with_label_values(&[label]).inc()
}
}
/// Increment counter on served chunks.
pub fn on_served(&self, label: &'static str) {
if let Some(metrics) = &self.0 {
metrics.served_chunks.with_label_values(&[label]).inc()
}
}
/// Increment retry counter.
pub fn on_retry(&self) {
if let Some(metrics) = &self.0 {
metrics.retries.inc()
}
}
}
impl metrics::Metrics for Metrics {
fn try_register(registry: &Registry) -> Result<Self, PrometheusError> {
let metrics = MetricsInner {
fetched_chunks: prometheus::register(
CounterVec::new(
Opts::new(
"parachain_fetched_chunks_total",
"Total number of fetched chunks.",
),
&["success"]
)?,
registry,
)?,
served_chunks: prometheus::register(
CounterVec::new(
Opts::new(
"parachain_served_chunks_total",
"Total number of chunks served by this backer.",
),
&["success"]
)?,
registry,
)?,
retries: prometheus::register(
Counter::new(
"parachain_fetch_retries_total",
"Number of times we did not succeed in fetching a chunk and needed to try more backers.",
)?,
registry,
)?,
};
Ok(Metrics(Some(metrics)))
}
}
@@ -0,0 +1,421 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashSet;
use futures::channel::mpsc;
use futures::channel::oneshot;
use futures::future::select;
use futures::{FutureExt, SinkExt};
use polkadot_erasure_coding::branch_hash;
use polkadot_node_network_protocol::request_response::{
request::{OutgoingRequest, RequestError, Requests},
v1::{AvailabilityFetchingRequest, AvailabilityFetchingResponse},
};
use polkadot_primitives::v1::{
AuthorityDiscoveryId, BlakeTwo256, ErasureChunk, GroupIndex, Hash, HashT, OccupiedCore,
SessionIndex,
};
use polkadot_subsystem::messages::{
AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage,
};
use polkadot_subsystem::SubsystemContext;
use crate::{
error::{Error, Result},
session_cache::{BadValidators, SessionInfo},
LOG_TARGET,
metrics::{Metrics, SUCCEEDED, FAILED},
};
#[cfg(test)]
mod tests;
/// Configuration for a `FetchTask`
///
/// This exists to separate the preparation of a `FetchTask` from actually starting it, which is
/// beneficial as it allows for taking the session info by reference.
pub struct FetchTaskConfig {
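/// The prepared task to run; `None` if our own group is responsible for the candidate and no fetching is needed.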
prepared_running: Option<RunningTask>,
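/// Relay parents for which the candidate this task would fetch is considered live.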
live_in: HashSet<Hash>,
}
/// Information about a task fetching an erasure chunk.
pub struct FetchTask {
/// For what relay parents this task is relevant.
///
/// In other words, for which relay chain parents this candidate is considered live.
/// This is updated on every `ActiveLeavesUpdate` and enables us to know when we can safely
/// stop keeping track of that candidate/chunk.
live_in: HashSet<Hash>,
/// We keep the task around until `live_in` becomes empty, to make
/// sure we won't re-fetch an already fetched candidate.
state: FetchedState,
}
/// State of a particular candidate chunk fetching process.
enum FetchedState {
/// Chunk fetch has started.
///
/// Once the contained `Sender` is dropped, any still running task will be canceled.
Started(oneshot::Sender<()>),
/// All relevant `live_in` entries have been removed before we were able to get our chunk.
Canceled,
}
/// Messages sent from `FetchTask`s to be handled/forwarded.
pub enum FromFetchTask {
/// Message to other subsystem.
Message(AllMessages),
/// Concluded with result.
///
/// In case of `None` everything was fine, in case of `Some`, some validators in the group
/// did not serve us our chunk as expected.
Concluded(Option<BadValidators>),
}
/// Information a running task needs.
struct RunningTask {
/// For what session we have been spawned.
session_index: SessionIndex,
/// Index of validator group to fetch the chunk from.
///
/// Needed for reporting bad validators.
group_index: GroupIndex,
/// Validators to request the chunk from.
///
/// This vector gets drained during execution of the task (it will be empty afterwards).
group: Vec<AuthorityDiscoveryId>,
/// The request to send.
request: AvailabilityFetchingRequest,
/// Root hash for verifying the chunk's validity.
erasure_root: Hash,
/// Relay parent of the candidate to fetch.
relay_parent: Hash,
/// Sender for communicating with other subsystems and reporting results.
sender: mpsc::Sender<FromFetchTask>,
/// Prometheus metrics for reporting results.
metrics: Metrics,
}
impl FetchTaskConfig {
/// Create a new configuration for a [`FetchTask`].
///
/// The result of this function can be passed into [`FetchTask::start`].
pub fn new(
leaf: Hash,
core: &OccupiedCore,
sender: mpsc::Sender<FromFetchTask>,
metrics: Metrics,
session_info: &SessionInfo,
) -> Self {
let live_in = vec![leaf].into_iter().collect();
// Don't run tasks for our backing group:
if session_info.our_group == core.group_responsible {
return FetchTaskConfig {
live_in,
prepared_running: None,
};
}
let prepared_running = RunningTask {
session_index: session_info.session_index,
group_index: core.group_responsible,
group: session_info.validator_groups.get(core.group_responsible.0 as usize)
.expect("The responsible group of a candidate should be available in the corresponding session. qed.")
.clone(),
request: AvailabilityFetchingRequest {
candidate_hash: core.candidate_hash,
index: session_info.our_index,
},
erasure_root: core.candidate_descriptor.erasure_root,
relay_parent: core.candidate_descriptor.relay_parent,
metrics,
sender,
};
FetchTaskConfig {
live_in,
prepared_running: Some(prepared_running),
}
}
}
impl FetchTask {
/// Start fetching a chunk.
///
/// A task handling the fetching of the configured chunk will be spawned.
pub async fn start<Context>(config: FetchTaskConfig, ctx: &mut Context) -> Result<Self>
where
Context: SubsystemContext,
{
let FetchTaskConfig {
prepared_running,
live_in,
} = config;
if let Some(running) = prepared_running {
let (handle, kill) = oneshot::channel();
ctx.spawn("chunk-fetcher", running.run(kill).boxed())
.await
.map_err(|e| Error::SpawnTask(e))?;
Ok(FetchTask {
live_in,
state: FetchedState::Started(handle),
})
} else {
Ok(FetchTask {
live_in,
state: FetchedState::Canceled,
})
}
}
/// Add the given leaf to the relay parents which are making this task relevant.
///
/// This is for bookkeeping, so we know we are already fetching a given chunk.
pub fn add_leaf(&mut self, leaf: Hash) {
self.live_in.insert(leaf);
}
/// Remove leaves and cancel the task if the last relevant leaf was removed while the task
/// was still fetching.
pub fn remove_leaves(&mut self, leaves: &HashSet<Hash>) {
self.live_in.retain(|leaf| !leaves.contains(leaf));
if self.live_in.is_empty() && !self.is_finished() {
self.state = FetchedState::Canceled
}
}
/// Whether or not there are still relay parents around with this candidate pending
/// availability.
pub fn is_live(&self) -> bool {
!self.live_in.is_empty()
}
/// Whether or not this task can be considered finished.
///
/// That is, it is either canceled, succeeded or failed.
pub fn is_finished(&self) -> bool {
match &self.state {
FetchedState::Canceled => true,
FetchedState::Started(sender) => sender.is_canceled(),
}
}
}
/// Things that can go wrong in task execution.
#[derive(Debug)]
enum TaskError {
/// The peer failed to deliver a correct chunk for some reason (has been reported as
/// appropriate).
PeerError,
/// This very node is seemingly shutting down (sending of message failed).
ShuttingDown,
}
impl RunningTask {
async fn run(self, kill: oneshot::Receiver<()>) {
// Wait for completion/or cancel.
let run_it = self.run_inner();
futures::pin_mut!(run_it);
let _ = select(run_it, kill).await;
}
/// Fetch and store chunk.
///
/// Try validators in backing group in order.
async fn run_inner(mut self) {
let mut bad_validators = Vec::new();
let mut label = FAILED;
let mut count: u32 = 0;
// Try validators in reverse order:
while let Some(validator) = self.group.pop() {
// Report retries:
if count > 0 {
self.metrics.on_retry();
}
count += 1;
// Send request:
let resp = match self.do_request(&validator).await {
Ok(resp) => resp,
Err(TaskError::ShuttingDown) => {
tracing::info!(
target: LOG_TARGET,
"Node seems to be shutting down, canceling fetch task"
);
self.metrics.on_fetch(FAILED);
return
}
Err(TaskError::PeerError) => {
bad_validators.push(validator);
continue
}
};
let chunk = match resp {
AvailabilityFetchingResponse::Chunk(resp) => {
resp.recombine_into_chunk(&self.request)
}
AvailabilityFetchingResponse::NoSuchChunk => {
tracing::debug!(
target: LOG_TARGET,
validator = ?validator,
"Validator did not have our chunk"
);
bad_validators.push(validator);
continue
}
};
// Data genuine?
if !self.validate_chunk(&validator, &chunk) {
bad_validators.push(validator);
continue;
}
// Ok, let's store it and be happy:
self.store_chunk(chunk).await;
label = SUCCEEDED;
break;
}
self.metrics.on_fetch(label);
self.conclude(bad_validators).await;
}
/// Do request and return response, if successful.
async fn do_request(
&mut self,
validator: &AuthorityDiscoveryId,
) -> std::result::Result<AvailabilityFetchingResponse, TaskError> {
let (full_request, response_recv) =
OutgoingRequest::new(validator.clone(), self.request);
let requests = Requests::AvailabilityFetching(full_request);
self.sender
.send(FromFetchTask::Message(AllMessages::NetworkBridge(
NetworkBridgeMessage::SendRequests(vec![requests]),
)))
.await
.map_err(|_| TaskError::ShuttingDown)?;
match response_recv.await {
Ok(resp) => Ok(resp),
Err(RequestError::InvalidResponse(err)) => {
tracing::warn!(
target: LOG_TARGET,
origin= ?validator,
err= ?err,
"Peer sent us invalid erasure chunk data"
);
Err(TaskError::PeerError)
}
Err(RequestError::NetworkError(err)) => {
tracing::warn!(
target: LOG_TARGET,
origin= ?validator,
err= ?err,
"Some network error occurred when fetching erasure chunk"
);
Err(TaskError::PeerError)
}
Err(RequestError::Canceled(oneshot::Canceled)) => {
tracing::warn!(target: LOG_TARGET,
origin= ?validator,
"Erasure chunk request got canceled");
Err(TaskError::PeerError)
}
}
}
fn validate_chunk(&self, validator: &AuthorityDiscoveryId, chunk: &ErasureChunk) -> bool {
let anticipated_hash =
match branch_hash(&self.erasure_root, &chunk.proof, chunk.index.0 as usize) {
Ok(hash) => hash,
Err(e) => {
tracing::warn!(
target: LOG_TARGET,
candidate_hash = ?self.request.candidate_hash,
origin = ?validator,
error = ?e,
"Failed to calculate chunk merkle proof",
);
return false;
}
};
let erasure_chunk_hash = BlakeTwo256::hash(&chunk.chunk);
if anticipated_hash != erasure_chunk_hash {
tracing::warn!(target: LOG_TARGET, origin = ?validator, "Received chunk does not match merkle tree");
return false;
}
true
}
/// Store given chunk and log any error.
async fn store_chunk(&mut self, chunk: ErasureChunk) {
let (tx, rx) = oneshot::channel();
let r = self
.sender
.send(FromFetchTask::Message(AllMessages::AvailabilityStore(
AvailabilityStoreMessage::StoreChunk {
candidate_hash: self.request.candidate_hash,
relay_parent: self.relay_parent,
chunk,
tx,
},
)))
.await;
if let Err(err) = r {
tracing::error!(target: LOG_TARGET, err= ?err, "Storing erasure chunk failed, system shutting down?");
}
if let Err(oneshot::Canceled) = rx.await {
tracing::error!(target: LOG_TARGET, "Storing erasure chunk failed");
}
}
/// Tell subsystem we are done.
async fn conclude(&mut self, bad_validators: Vec<AuthorityDiscoveryId>) {
let payload = if bad_validators.is_empty() {
None
} else {
Some(BadValidators {
session_index: self.session_index,
group_index: self.group_index,
bad_validators,
})
};
if let Err(err) = self.sender.send(FromFetchTask::Concluded(payload)).await {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Sending concluded message for task failed"
);
}
}
}
@@ -0,0 +1,315 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashMap;
use std::sync::Arc;
use parity_scale_codec::Encode;
use futures::channel::{mpsc, oneshot};
use futures::{executor, Future, FutureExt, StreamExt, select};
use futures::task::{Poll, Context, noop_waker};
use polkadot_erasure_coding::{obtain_chunks_v1 as obtain_chunks, branches};
use sc_network as network;
use sp_keyring::Sr25519Keyring;
use polkadot_primitives::v1::{AvailableData, BlockData, CandidateHash, HeadData, PersistedValidationData, PoV, ValidatorIndex};
use polkadot_node_network_protocol::request_response::v1;
use polkadot_subsystem::messages::AllMessages;
use crate::metrics::Metrics;
use super::*;
#[test]
fn task_can_be_canceled() {
let (task, _rx) = get_test_running_task();
let (handle, kill) = oneshot::channel();
std::mem::drop(handle);
let running_task = task.run(kill);
futures::pin_mut!(running_task);
let waker = noop_waker();
let mut ctx = Context::from_waker(&waker);
assert!(running_task.poll(&mut ctx) == Poll::Ready(()), "Task is immediately finished");
}
/// Make sure the task won't accept a chunk that is invalid.
#[test]
fn task_does_not_accept_invalid_chunk() {
let (mut task, rx) = get_test_running_task();
let validators = vec![Sr25519Keyring::Alice.public().into()];
task.group = validators;
let test = TestRun {
chunk_responses: {
let mut m = HashMap::new();
m.insert(
Sr25519Keyring::Alice.public().into(),
AvailabilityFetchingResponse::Chunk(
v1::ChunkResponse {
chunk: vec![1,2,3],
proof: vec![vec![9,8,2], vec![2,3,4]],
}
)
);
m
},
valid_chunks: HashSet::new(),
};
test.run(task, rx);
}
#[test]
fn task_stores_valid_chunk() {
let (mut task, rx) = get_test_running_task();
let (root_hash, chunk) = get_valid_chunk_data();
task.erasure_root = root_hash;
task.request.index = chunk.index;
let validators = vec![Sr25519Keyring::Alice.public().into()];
task.group = validators;
let test = TestRun {
chunk_responses: {
let mut m = HashMap::new();
m.insert(
Sr25519Keyring::Alice.public().into(),
AvailabilityFetchingResponse::Chunk(
v1::ChunkResponse {
chunk: chunk.chunk.clone(),
proof: chunk.proof,
}
)
);
m
},
valid_chunks: {
let mut s = HashSet::new();
s.insert(chunk.chunk);
s
},
};
test.run(task, rx);
}
#[test]
fn task_does_not_accept_wrongly_indexed_chunk() {
let (mut task, rx) = get_test_running_task();
let (root_hash, chunk) = get_valid_chunk_data();
task.erasure_root = root_hash;
task.request.index = ValidatorIndex(chunk.index.0+1);
let validators = vec![Sr25519Keyring::Alice.public().into()];
task.group = validators;
let test = TestRun {
chunk_responses: {
let mut m = HashMap::new();
m.insert(
Sr25519Keyring::Alice.public().into(),
AvailabilityFetchingResponse::Chunk(
v1::ChunkResponse {
chunk: chunk.chunk.clone(),
proof: chunk.proof,
}
)
);
m
},
valid_chunks: HashSet::new(),
};
test.run(task, rx);
}
/// Task stores the chunk if at least one validator has a valid chunk.
#[test]
fn task_stores_valid_chunk_if_there_is_one() {
let (mut task, rx) = get_test_running_task();
let (root_hash, chunk) = get_valid_chunk_data();
task.erasure_root = root_hash;
task.request.index = chunk.index;
let validators = [
// Only Alice has valid chunk - should succeed, even though she is tried last.
Sr25519Keyring::Alice,
Sr25519Keyring::Bob, Sr25519Keyring::Charlie,
Sr25519Keyring::Dave, Sr25519Keyring::Eve,
]
.iter().map(|v| v.public().into()).collect::<Vec<_>>();
task.group = validators;
let test = TestRun {
chunk_responses: {
let mut m = HashMap::new();
m.insert(
Sr25519Keyring::Alice.public().into(),
AvailabilityFetchingResponse::Chunk(
v1::ChunkResponse {
chunk: chunk.chunk.clone(),
proof: chunk.proof,
}
)
);
m.insert(
Sr25519Keyring::Bob.public().into(),
AvailabilityFetchingResponse::NoSuchChunk
);
m.insert(
Sr25519Keyring::Charlie.public().into(),
AvailabilityFetchingResponse::Chunk(
v1::ChunkResponse {
chunk: vec![1,2,3],
proof: vec![vec![9,8,2], vec![2,3,4]],
}
)
);
m
},
valid_chunks: {
let mut s = HashSet::new();
s.insert(chunk.chunk);
s
},
};
test.run(task, rx);
}
struct TestRun {
/// Response to deliver for a given validator.
/// A missing entry means the request gets answered with an error (`RequestFailure::Refused`).
chunk_responses: HashMap<AuthorityDiscoveryId, AvailabilityFetchingResponse>,
/// Set of chunks that should be considered valid:
valid_chunks: HashSet<Vec<u8>>,
}
impl TestRun {
fn run(self, task: RunningTask, rx: mpsc::Receiver<FromFetchTask>) {
sp_tracing::try_init_simple();
let mut rx = rx.fuse();
let task = task.run_inner().fuse();
futures::pin_mut!(task);
executor::block_on(async {
let mut end_ok = false;
loop {
let msg = select!(
from_task = rx.next() => {
match from_task {
Some(msg) => msg,
None => break,
}
},
() = task =>
break,
);
match msg {
FromFetchTask::Concluded(_) => break,
FromFetchTask::Message(msg) =>
end_ok = self.handle_message(msg).await,
}
}
if !end_ok {
panic!("Task ended prematurely (failed to store valid chunk)!");
}
});
}
/// Returns true if, after processing the given message, it would be ok for the stream to
/// end.
async fn handle_message(&self, msg: AllMessages) -> bool {
match msg {
AllMessages::NetworkBridge(NetworkBridgeMessage::SendRequests(reqs)) => {
let mut valid_responses = 0;
for req in reqs {
let req = match req {
Requests::AvailabilityFetching(req) => req,
};
let response = self.chunk_responses.get(&req.peer)
.ok_or(network::RequestFailure::Refused);
if let Ok(AvailabilityFetchingResponse::Chunk(resp)) = &response {
if self.valid_chunks.contains(&resp.chunk) {
valid_responses += 1;
}
}
req.pending_response.send(response.map(Encode::encode))
.expect("Sending response should succeed");
}
return (valid_responses == 0) && self.valid_chunks.is_empty()
}
AllMessages::AvailabilityStore(
AvailabilityStoreMessage::StoreChunk { chunk, tx, .. }
) => {
assert!(self.valid_chunks.contains(&chunk.chunk));
tx.send(Ok(())).expect("Answering fetching task should work");
return true
}
_ => {
tracing::debug!(target: LOG_TARGET, "Unexpected message");
return false
}
}
}
}
/// Get a `RunningTask` filled with dummy values.
fn get_test_running_task() -> (RunningTask, mpsc::Receiver<FromFetchTask>) {
let (tx,rx) = mpsc::channel(0);
(
RunningTask {
session_index: 0,
group_index: GroupIndex(0),
group: Vec::new(),
request: AvailabilityFetchingRequest {
candidate_hash: CandidateHash([43u8;32].into()),
index: ValidatorIndex(0),
},
erasure_root: Hash::repeat_byte(99),
relay_parent: Hash::repeat_byte(71),
sender: tx,
metrics: Metrics::new_dummy(),
},
rx
)
}
fn get_valid_chunk_data() -> (Hash, ErasureChunk) {
let fake_validator_count = 10;
let persisted = PersistedValidationData {
parent_head: HeadData(vec![7, 8, 9]),
relay_parent_number: Default::default(),
max_pov_size: 1024,
relay_parent_storage_root: Default::default(),
};
let pov_block = PoV {
block_data: BlockData(vec![45, 46, 47]),
};
let available_data = AvailableData {
validation_data: persisted, pov: Arc::new(pov_block),
};
let chunks = obtain_chunks(fake_validator_count, &available_data).unwrap();
let branches = branches(chunks.as_ref());
let root = branches.root();
let chunk = branches.enumerate()
.map(|(index, (proof, chunk))| ErasureChunk {
chunk: chunk.to_vec(),
index: ValidatorIndex(index as _),
proof,
})
.next().expect("There really should be 10 chunks.");
(root, chunk)
}
@@ -0,0 +1,236 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Requester takes care of requesting erasure chunks for candidates that are pending
//! availability.
use std::collections::{
hash_map::{Entry, HashMap},
hash_set::HashSet,
};
use std::iter::IntoIterator;
use std::pin::Pin;
use std::sync::Arc;
use futures::{
channel::mpsc,
task::{Context, Poll},
Stream,
};
use sp_keystore::SyncCryptoStorePtr;
use polkadot_node_subsystem_util::request_availability_cores_ctx;
use polkadot_primitives::v1::{CandidateHash, CoreState, Hash, OccupiedCore};
use polkadot_subsystem::{
messages::AllMessages, ActiveLeavesUpdate, jaeger, SubsystemContext,
};
use super::{error::recv_runtime, session_cache::SessionCache, Result, LOG_TARGET, Metrics};
/// A task fetching a particular chunk.
mod fetch_task;
use fetch_task::{FetchTask, FetchTaskConfig, FromFetchTask};
/// The `Requester` takes care of requesting erasure chunks from backing groups and storing them
/// in the av store.
///
/// It implements a stream that needs to be advanced for it to make progress.
pub struct Requester {
/// Candidates we need to fetch our chunk for.
///
/// We keep those around as long as a candidate is pending availability on some leaf, so we
/// won't fetch chunks multiple times.
fetches: HashMap<CandidateHash, FetchTask>,
/// Localized information about sessions we are currently interested in.
session_cache: SessionCache,
/// Sender to be cloned for `FetchTask`s.
tx: mpsc::Sender<FromFetchTask>,
/// Receive messages from `FetchTask`.
rx: mpsc::Receiver<FromFetchTask>,
/// Prometheus Metrics
metrics: Metrics,
}
impl Requester {
/// Create a new `Requester`.
///
/// You must feed it with `ActiveLeavesUpdate` via `update_fetching_heads` and make it progress
/// by advancing the stream.
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
// All we do is forwarding messages, no need to make this big.
// Each sender will get one slot, see
// [here](https://docs.rs/futures/0.3.13/futures/channel/mpsc/fn.channel.html).
let (tx, rx) = mpsc::channel(0);
Requester {
fetches: HashMap::new(),
session_cache: SessionCache::new(keystore),
tx,
rx,
metrics,
}
}
/// Update heads that need availability distribution.
///
/// For all active heads we will be fetching our chunks for availability distribution.
pub async fn update_fetching_heads<Context>(
&mut self,
ctx: &mut Context,
update: ActiveLeavesUpdate,
) -> Result<()>
where
Context: SubsystemContext,
{
let ActiveLeavesUpdate {
activated,
deactivated,
} = update;
// Order important! We need to handle activated, prior to deactivated, otherwise we might
// cancel still needed jobs.
self.start_requesting_chunks(ctx, activated.into_iter())
.await?;
self.stop_requesting_chunks(deactivated.into_iter());
Ok(())
}
/// Start requesting chunks for newly imported heads.
async fn start_requesting_chunks<Context>(
&mut self,
ctx: &mut Context,
new_heads: impl Iterator<Item = (Hash, Arc<jaeger::Span>)>,
) -> Result<()>
where
Context: SubsystemContext,
{
for (leaf, _) in new_heads {
let cores = query_occupied_cores(ctx, leaf).await?;
self.add_cores(ctx, leaf, cores).await?;
}
Ok(())
}
/// Stop requesting chunks for obsolete heads.
fn stop_requesting_chunks(&mut self, obsolete_leaves: impl Iterator<Item = Hash>) {
let obsolete_leaves: HashSet<_> = obsolete_leaves.collect();
self.fetches.retain(|_, task| {
task.remove_leaves(&obsolete_leaves);
task.is_live()
})
}
/// Add candidates corresponding to a particular relay parent, starting requests where
/// necessary.
///
/// Note: The passed in `leaf` is not the same as CandidateDescriptor::relay_parent in the
/// given cores. The latter is the relay_parent this candidate considers its parent, while the
/// passed in leaf might be some later block where the candidate is still pending availability.
async fn add_cores<Context>(
&mut self,
ctx: &mut Context,
leaf: Hash,
cores: impl IntoIterator<Item = OccupiedCore>,
) -> Result<()>
where
Context: SubsystemContext,
{
for core in cores {
match self.fetches.entry(core.candidate_hash) {
Entry::Occupied(mut e) =>
// Just bookkeeping - we are already requesting that chunk:
{
e.get_mut().add_leaf(leaf);
}
Entry::Vacant(e) => {
let tx = self.tx.clone();
let metrics = self.metrics.clone();
let task_cfg = self
.session_cache
.with_session_info(
ctx,
// We use leaf here, as relay_parent must be in the same session as the
// leaf. (Cores are dropped at session boundaries.) At the same time,
// only leaves are guaranteed to be fetchable by the state trie.
leaf,
|info| FetchTaskConfig::new(leaf, &core, tx, metrics, info),
)
.await?;
if let Some(task_cfg) = task_cfg {
e.insert(FetchTask::start(task_cfg, ctx).await?);
}
// Not a validator, nothing to do.
}
}
}
Ok(())
}
}
impl Stream for Requester {
type Item = AllMessages;
fn poll_next(
mut self: Pin<&mut Self>,
ctx: &mut Context,
) -> Poll<Option<AllMessages>> {
loop {
match Pin::new(&mut self.rx).poll_next(ctx) {
Poll::Ready(Some(FromFetchTask::Message(m))) =>
return Poll::Ready(Some(m)),
Poll::Ready(Some(FromFetchTask::Concluded(Some(bad_boys)))) => {
self.session_cache.report_bad_log(bad_boys);
continue
}
Poll::Ready(Some(FromFetchTask::Concluded(None))) =>
continue,
Poll::Ready(None) =>
return Poll::Ready(None),
Poll::Pending =>
return Poll::Pending,
}
}
}
}
/// Query all hashes and descriptors of candidates pending availability at a particular block.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn query_occupied_cores<Context>(
ctx: &mut Context,
relay_parent: Hash,
) -> Result<Vec<OccupiedCore>>
where
Context: SubsystemContext,
{
let cores = recv_runtime(request_availability_cores_ctx(relay_parent, ctx).await).await?;
Ok(cores
.into_iter()
.filter_map(|core_state| {
if let CoreState::Occupied(occupied) = core_state {
Some(occupied)
} else {
None
}
})
.collect())
}
@@ -0,0 +1,97 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Answer requests for availability chunks.
use futures::channel::oneshot;
use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1};
use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex};
use polkadot_subsystem::{
messages::{AllMessages, AvailabilityStoreMessage},
SubsystemContext,
};
use crate::error::{Error, Result};
use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}};
/// Variant of `answer_request` that takes care of Prometheus metrics and logging of errors.
///
/// Any errors of `answer_request` will simply be logged.
pub async fn answer_request_log<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::AvailabilityFetchingRequest>,
metrics: &Metrics,
) -> ()
where
Context: SubsystemContext,
{
let res = answer_request(ctx, req).await;
match res {
Ok(result) =>
metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}),
Err(err) => {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Serving chunk failed with error"
);
metrics.on_served(FAILED);
}
}
}
/// Answer an incoming chunk request by querying the av store.
///
/// Returns: Ok(true) if chunk was found and served.
pub async fn answer_request<Context>(
ctx: &mut Context,
req: IncomingRequest<v1::AvailabilityFetchingRequest>,
) -> Result<bool>
where
Context: SubsystemContext,
{
let chunk = query_chunk(ctx, req.payload.candidate_hash, req.payload.index).await?;
let result = chunk.is_some();
let response = match chunk {
None => v1::AvailabilityFetchingResponse::NoSuchChunk,
Some(chunk) => v1::AvailabilityFetchingResponse::Chunk(chunk.into()),
};
req.send_response(response).map_err(|_| Error::SendResponse)?;
Ok(result)
}
/// Query chunk from the availability store.
#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
async fn query_chunk<Context>(
ctx: &mut Context,
candidate_hash: CandidateHash,
validator_index: ValidatorIndex,
) -> Result<Option<ErasureChunk>>
where
Context: SubsystemContext,
{
let (tx, rx) = oneshot::channel();
ctx.send_message(AllMessages::AvailabilityStore(
AvailabilityStoreMessage::QueryChunk(candidate_hash, validator_index, tx),
))
.await;
rx.await.map_err(|e| Error::QueryChunkResponseChannel(e))
}
@@ -0,0 +1,275 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use std::collections::HashSet;
use lru::LruCache;
use rand::{seq::SliceRandom, thread_rng};
use sp_application_crypto::AppKey;
use sp_core::crypto::Public;
use sp_keystore::{CryptoStore, SyncCryptoStorePtr};
use polkadot_node_subsystem_util::{
request_session_index_for_child_ctx, request_session_info_ctx,
};
use polkadot_primitives::v1::SessionInfo as GlobalSessionInfo;
use polkadot_primitives::v1::{
AuthorityDiscoveryId, GroupIndex, Hash, SessionIndex, ValidatorId, ValidatorIndex,
};
use polkadot_subsystem::SubsystemContext;
use super::{
error::{recv_runtime, Result},
Error,
LOG_TARGET,
};
/// Caching of session info as needed by availability distribution.
///
/// It should be ensured that a cached session stays live in the cache as long as we might need it.
pub struct SessionCache {
/// Get the session index for a given relay parent.
///
/// We query this up to 100 times per block, so caching it here without roundtrips over the
/// overseer seems sensible.
session_index_cache: LruCache<Hash, SessionIndex>,
/// Look up cached sessions by SessionIndex.
///
/// Note: Performance of fetching is really secondary here, but we need to make sure we use any
/// existing cache entry before fetching new information, as we must not mess up the order of
/// validators in `SessionInfo::validator_groups`. (We want live TCP connections wherever
/// possible.)
session_info_cache: LruCache<SessionIndex, SessionInfo>,
/// Key store for determining whether we are a validator and what `ValidatorIndex` we have.
keystore: SyncCryptoStorePtr,
}
/// Localized session information, tailored for the needs of availability distribution.
#[derive(Clone)]
pub struct SessionInfo {
/// The index of this session.
pub session_index: SessionIndex,
/// Validator groups of the current session.
///
/// Each group's order is randomized. This way we achieve load balancing when requesting
/// chunks, as the validators in a group will be tried in that randomized order. Each node
/// should arrive at a different order, therefore we distribute the load on individual
/// validators.
pub validator_groups: Vec<Vec<AuthorityDiscoveryId>>,
/// Our own validator index in the current session.
pub our_index: ValidatorIndex,
/// Remember which group we belong to, so we won't start fetching chunks for candidates our
/// own group is responsible for. (We should have that chunk already.)
pub our_group: GroupIndex,
}
/// Report of bad validators.
///
/// Fetching tasks will report back validators that did not respond as expected, so we can re-order
/// them.
pub struct BadValidators {
/// The session index that was used.
pub session_index: SessionIndex,
/// The group the misbehaving validators belong to.
pub group_index: GroupIndex,
/// The list of bad validators.
pub bad_validators: Vec<AuthorityDiscoveryId>,
}
impl SessionCache {
/// Create a new `SessionCache`.
pub fn new(keystore: SyncCryptoStorePtr) -> Self {
SessionCache {
// 5 is relatively conservative; 1 or 2 should suffice:
session_index_cache: LruCache::new(5),
// We need to cache the current and the last session the most:
session_info_cache: LruCache::new(2),
keystore,
}
}
/// Tries to retrieve `SessionInfo` and calls `with_info` if successful.
///
/// If this node is not a validator, the function will return `None`.
///
/// Use this function over any `fetch_session_info` if all you need is a reference to
/// `SessionInfo`, as it avoids an expensive clone.
pub async fn with_session_info<Context, F, R>(
&mut self,
ctx: &mut Context,
parent: Hash,
with_info: F,
) -> Result<Option<R>>
where
Context: SubsystemContext,
F: FnOnce(&SessionInfo) -> R,
{
let session_index = match self.session_index_cache.get(&parent) {
Some(index) => *index,
None => {
let index =
recv_runtime(request_session_index_for_child_ctx(parent, ctx).await)
.await?;
self.session_index_cache.put(parent, index);
index
}
};
if let Some(info) = self.session_info_cache.get(&session_index) {
return Ok(Some(with_info(info)));
}
if let Some(info) = self
.query_info_from_runtime(ctx, parent, session_index)
.await?
{
let r = with_info(&info);
self.session_info_cache.put(session_index, info);
return Ok(Some(r));
}
Ok(None)
}
/// Variant of `report_bad` that never fails, but just logs errors.
///
/// Not being able to report bad validators is not fatal, so we should not shut down the
/// subsystem because of it.
pub fn report_bad_log(&mut self, report: BadValidators) {
if let Err(err) = self.report_bad(report) {
tracing::warn!(
target: LOG_TARGET,
err= ?err,
"Reporting bad validators failed with error"
);
}
}
/// Make sure we try unresponsive or misbehaving validators last.
///
/// We assume validators in a group are tried in reverse order, so the reported bad validators
/// will be put at the beginning of the group.
#[tracing::instrument(level = "trace", skip(self, report), fields(subsystem = LOG_TARGET))]
pub fn report_bad(&mut self, report: BadValidators) -> Result<()> {
let session = self
.session_info_cache
.get_mut(&report.session_index)
.ok_or(Error::NoSuchCachedSession)?;
let group = session
.validator_groups
.get_mut(report.group_index.0 as usize)
.expect("A bad validator report must contain a valid group for the reported session. qed.");
let bad_set = report.bad_validators.iter().collect::<HashSet<_>>();
// Get rid of bad boys:
group.retain(|v| !bad_set.contains(v));
// We are trying validators in reverse order, so bad ones should be first:
let mut new_group = report.bad_validators;
new_group.append(group);
*group = new_group;
Ok(())
}
/// Query needed information from runtime.
///
/// We need to pass in a relay parent for our call to `request_session_info_ctx`, although we
/// should not actually need it: it is presumably only used for internal caching based on relay
/// parents, which we don't make use of here. It should not do any harm though.
async fn query_info_from_runtime<Context>(
&self,
ctx: &mut Context,
parent: Hash,
session_index: SessionIndex,
) -> Result<Option<SessionInfo>>
where
Context: SubsystemContext,
{
let GlobalSessionInfo {
validators,
discovery_keys,
mut validator_groups,
..
} = recv_runtime(request_session_info_ctx(parent, session_index, ctx).await)
.await?
.ok_or(Error::NoSuchSession(session_index))?;
if let Some(our_index) = self.get_our_index(validators).await {
// Get our group index:
let our_group = validator_groups
.iter()
.enumerate()
.find_map(|(i, g)| {
g.iter().find_map(|v| {
if *v == our_index {
Some(GroupIndex(i as u32))
} else {
None
}
})
})
.expect("Every validator should be in a validator group. qed.");
// Shuffle validators in groups:
let mut rng = thread_rng();
for g in validator_groups.iter_mut() {
g.shuffle(&mut rng)
}
// Look up `AuthorityDiscoveryId`s right away:
let validator_groups: Vec<Vec<_>> = validator_groups
.into_iter()
.map(|group| {
group
.into_iter()
.map(|index| {
discovery_keys.get(index.0 as usize)
.expect("There should be a discovery key for each validator of each validator group. qed.")
.clone()
})
.collect()
})
.collect();
let info = SessionInfo {
validator_groups,
our_index,
session_index,
our_group,
};
return Ok(Some(info));
}
return Ok(None);
}
/// Get our `ValidatorIndex`.
///
/// Returns: None if we are not a validator.
async fn get_our_index(&self, validators: Vec<ValidatorId>) -> Option<ValidatorIndex> {
for (i, v) in validators.iter().enumerate() {
if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)])
.await
{
return Some(ValidatorIndex(i as u32));
}
}
None
}
}