Request based availability distribution (#2423)

* WIP * availability distribution, still very wip. Work on the requesting side of things. * Some docs on what I intend to do. * Checkpoint of session cache implementation as I will likely replace it with something smarter. * More work, mostly on cache and getting things to type check. * Only derive MallocSizeOf and Debug for std. * availability-distribution: Cache feature complete. * Sketch out logic in `FetchTask` for actual fetching. - Compile fixes. - Cleanup. * Format cleanup. * More format fixes. * Almost feature complete `fetch_task`. Missing: - Check for cancel - Actual querying of peer ids. * Finish FetchTask so far. * Directly use AuthorityDiscoveryId in protocol and cache. * Resolve `AuthorityDiscoveryId` on sending requests. * Rework fetch_task - also make it impossible to check the wrong chunk index. - Export needed function in validator_discovery. * From<u32> implementation for `ValidatorIndex`. * Fixes and more integration work. * Make session cache proper lru cache. * Use proper lru cache. * Requester finished. * ProtocolState -> Requester Also make sure to not fetch our own chunk. * Cleanup + fixes. * Remove unused functions - FetchTask::is_finished - SessionCache::fetch_session_info * availability-distribution responding side. * Cleanup + Fixes. * More fixes. * More fixes. adder-collator is running! * Some docs. * Docs. * Fix reporting of bad guys. * Fix tests * Make all tests compile. * Fix test. * Cleanup + get rid of some warnings. * state -> requester * Mostly doc fixes. * Fix test suite. * Get rid of now redundant message types. * WIP * Rob's review remarks. * Fix test suite. * core.relay_parent -> leaf for session request. * Style fix. * Decrease request timeout. * Cleanup obsolete errors. * Metrics + don't fail on non fatal errors. * requester.rs -> requester/mod.rs * Panic on invalid BadValidator report. * Fix indentation. * Use typed default timeout constant. * Make channel size 0, as each sender gets one slot anyways. 
* Fix incorrect metrics initialization. * Fix build after merge. * More fixes. * Hopefully valid metrics names. * Better metrics names. * Some tests that already work. * Slightly better docs. * Some more tests. * Fix network bridge test.
2026-04-26 16:57:58 +00:00 · 2021-02-26 18:58:07 +01:00
parent 241b1f12a7
commit 48409e5548
45 changed files with 2037 additions and 1523 deletions
@@ -300,14 +300,6 @@ pub mod v1 {
 	use super::RequestId;
 	use std::convert::TryFrom;

-	/// Network messages used by the availability distribution subsystem
-	#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
-	pub enum AvailabilityDistributionMessage {
-		/// An erasure chunk for a given candidate hash.
-		#[codec(index = 0)]
-		Chunk(CandidateHash, ErasureChunk),
-	}
-
 	/// Network messages used by the availability recovery subsystem.
 	#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
 	pub enum AvailabilityRecoveryMessage {
@@ -456,9 +448,6 @@ pub mod v1 {
 	/// All network messages on the validation peer-set.
 	#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
 	pub enum ValidationProtocol {
-		/// Availability distribution messages
-		#[codec(index = 0)]
-		AvailabilityDistribution(AvailabilityDistributionMessage),
 		/// Bitfield distribution messages
 		#[codec(index = 1)]
 		BitfieldDistribution(BitfieldDistributionMessage),
@@ -476,7 +465,6 @@ pub mod v1 {
 		ApprovalDistribution(ApprovalDistributionMessage),
 	}

-	impl_try_from!(ValidationProtocol, AvailabilityDistribution, AvailabilityDistributionMessage);
 	impl_try_from!(ValidationProtocol, BitfieldDistribution, BitfieldDistributionMessage);
 	impl_try_from!(ValidationProtocol, PoVDistribution, PoVDistributionMessage);
 	impl_try_from!(ValidationProtocol, StatementDistribution, StatementDistributionMessage);
@@ -60,7 +60,11 @@ pub enum Protocol {
 }

 /// Default request timeout in seconds.
-const DEFAULT_REQUEST_TIMEOUT: u64 = 8; 
+///
+/// When decreasing this value, take into account that the very first request might need to open a
+/// connection, which can be slow. If this causes problems, we should ensure connectivity via peer
+/// sets.
+const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3); 

 impl Protocol {
 	/// Get a configuration for a given Request response protocol.
@@ -86,7 +90,7 @@ impl Protocol {
 				max_request_size: 10_000,
 				max_response_size: 1_000_000,
 				// Also just some relatively conservative guess:
-				request_timeout: Duration::from_secs(DEFAULT_REQUEST_TIMEOUT),
+				request_timeout: DEFAULT_REQUEST_TIMEOUT,
 				inbound_queue: Some(tx),
 			},
 		};
@@ -22,6 +22,8 @@ use sc_network as network;
 use sc_network::config as netconfig;
 use sc_network::PeerId;

+use polkadot_primitives::v1::AuthorityDiscoveryId;
+
 use super::{v1, Protocol};

 /// Common properties of any `Request`.
@@ -69,7 +71,7 @@ impl Requests {
 #[derive(Debug)]
 pub struct OutgoingRequest<Req> {
 	/// Intended recipient of this request.
-	pub peer: PeerId,
+	pub peer: AuthorityDiscoveryId,
 	/// The actual request to send over the wire.
 	pub payload: Req,
 	/// Sender which is used by networking to get us back a response.
@@ -98,7 +100,7 @@ where
 	/// It will contain a sender that is used by the networking for sending back responses. The
 	/// connected receiver is returned as the second element in the returned tuple.
 	pub fn new(
-		peer: PeerId,
+		peer: AuthorityDiscoveryId,
 		payload: Req,
 	) -> (
 		Self,
@@ -24,18 +24,54 @@ use super::request::IsRequest;
 use super::Protocol;

 /// Request an availability chunk.
-#[derive(Debug, Clone, Encode, Decode)]
+#[derive(Debug, Copy, Clone, Encode, Decode)]
 pub struct AvailabilityFetchingRequest {
-	candidate_hash: CandidateHash,
-	index: ValidatorIndex,
+	/// Hash of candidate we want a chunk for.
+	pub candidate_hash: CandidateHash,
+	/// The index of the chunk to fetch.
+	pub index: ValidatorIndex,
 }

 /// Receive a requested erasure chunk.
 #[derive(Debug, Clone, Encode, Decode)]
 pub enum AvailabilityFetchingResponse {
-	/// The requested chunk.
+	/// The requested chunk data.
 	#[codec(index = 0)]
-	Chunk(ErasureChunk),
+	Chunk(ChunkResponse),
+	/// Node was not in possession of the requested chunk.
+	#[codec(index = 1)]
+	NoSuchChunk,
+}
+
+/// Slimmed-down variant of `ErasureChunk`.
+///
+/// Instead of transmitting a full `ErasureChunk` we transmit `ChunkResponse` in
+/// `AvailabilityFetchingResponse`, which omits the chunk's index. The index is already known by
+/// the requester, and by not transmitting it we ensure the requester uses its own index value
+/// when validating the response, thus making sure it received what it requested.
+#[derive(Debug, Clone, Encode, Decode)]
+pub struct ChunkResponse {
+	/// The erasure-encoded chunk of data belonging to the candidate block.
+	pub chunk: Vec<u8>,
+	/// Proof for this chunk's branch in the Merkle tree.
+	pub proof: Vec<Vec<u8>>,
+}
+
+impl From<ErasureChunk> for ChunkResponse {
+	fn from(ErasureChunk {chunk, index: _, proof}: ErasureChunk) -> Self {
+		ChunkResponse {chunk, proof}
+	}
+}
+
+impl ChunkResponse {
+	/// Re-build an `ErasureChunk` from response and request.
+	pub fn recombine_into_chunk(self, req: &AvailabilityFetchingRequest) -> ErasureChunk {
+		ErasureChunk {
+			chunk: self.chunk,
+			proof: self.proof,
+			index: req.index,
+		}
+	}
 }

 impl IsRequest for AvailabilityFetchingRequest {