availability-distribution: Retry failed fetches on next block. (#2762)

* availability-distribution: Retry on fail on next block.

Retry failed fetches on the next block if the candidate is still pending availability.

* Update node/network/availability-distribution/src/requester/fetch_task/mod.rs

Co-authored-by: Andronik Ordian <write@reusable.software>

* Fix existing tests.

* Add test for trying all validators.

* Add test for testing retries.

Co-authored-by: Andronik Ordian <write@reusable.software>
This commit is contained in:
Robert Klotzner
2021-03-30 00:28:43 +02:00
committed by GitHub
parent e906598e94
commit 0bc42785b4
6 changed files with 105 additions and 18 deletions
@@ -26,10 +26,7 @@ use polkadot_node_network_protocol::request_response::{
request::{OutgoingRequest, RequestError, Requests, Recipient},
v1::{ChunkFetchingRequest, ChunkFetchingResponse},
};
use polkadot_primitives::v1::{
AuthorityDiscoveryId, BlakeTwo256, GroupIndex, Hash, HashT, OccupiedCore,
SessionIndex,
};
use polkadot_primitives::v1::{AuthorityDiscoveryId, BlakeTwo256, CandidateHash, GroupIndex, Hash, HashT, OccupiedCore, SessionIndex};
use polkadot_node_primitives::ErasureChunk;
use polkadot_subsystem::messages::{
AllMessages, AvailabilityStoreMessage, NetworkBridgeMessage, IfDisconnected,
@@ -89,6 +86,9 @@ pub enum FromFetchTask {
/// In case of `None` everything was fine, in case of `Some`, some validators in the group
/// did not serve us our chunk as expected.
Concluded(Option<BadValidators>),
/// We were not able to fetch the desired chunk for the given `CandidateHash`.
Failed(CandidateHash),
}
/// Information a running task needs.
@@ -262,7 +262,7 @@ impl RunningTask {
/// Try validators in backing group in order.
async fn run_inner(mut self) {
let mut bad_validators = Vec::new();
let mut label = FAILED;
let mut succeeded = false;
let mut count: u32 = 0;
let mut _span = self.span.child("fetch-task")
.with_chunk_index(self.request.index.0)
@@ -315,13 +315,18 @@ impl RunningTask {
// Ok, let's store it and be happy:
self.store_chunk(chunk).await;
label = SUCCEEDED;
succeeded = true;
_span.add_string_tag("success", "true");
break;
}
_span.add_int_tag("tries", count as _);
self.metrics.on_fetch(label);
self.conclude(bad_validators).await;
if succeeded {
self.metrics.on_fetch(SUCCEEDED);
self.conclude(bad_validators).await;
} else {
self.metrics.on_fetch(FAILED);
self.conclude_fail().await
}
}
/// Do request and return response, if successful.
@@ -434,4 +439,14 @@ impl RunningTask {
);
}
}
/// Report over `self.sender` that the chunk for this task's candidate could
/// not be fetched.
///
/// The message carries the candidate hash taken from `self.request`. If the
/// message cannot be delivered, the error is only logged as a warning; it is
/// not propagated to the caller.
async fn conclude_fail(&mut self) {
    let failure = FromFetchTask::Failed(self.request.candidate_hash);
    match self.sender.send(failure).await {
        Ok(_) => {}
        Err(err) => {
            tracing::warn!(
                target: LOG_TARGET,
                ?err,
                "Sending `Failed` message for task failed"
            );
        }
    }
}
}
@@ -228,6 +228,7 @@ impl TestRun {
);
match msg {
FromFetchTask::Concluded(_) => break,
FromFetchTask::Failed(_) => break,
FromFetchTask::Message(msg) =>
end_ok = self.handle_message(msg).await,
}