mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-18 09:31:02 +00:00
Further improved availability recovery (#3711)
* WiP. * Things compile. * cargo fmt * Passing tests + fix warnings. * Metrics for availability recovery. * Basic test. * Fix typos and actually check for overflow. * cargo fmt * Register metrics. * More tests. * Fix warning. * cargo +nightly fmt * Fix metrics * Get rid of unsafe. * tabify * spellcheck Co-authored-by: Andronik Ordian <write@reusable.software> Co-authored-by: Bastian Köcher <info@kchr.de>
This commit is contained in:
@@ -53,7 +53,8 @@ fn test_harness_fast_path<T: Future<Output = (VirtualOverseer, RequestResponseCo
|
||||
let (context, virtual_overseer) = test_helpers::make_subsystem_context(pool.clone());
|
||||
|
||||
let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver();
|
||||
let subsystem = AvailabilityRecoverySubsystem::with_fast_path(collation_req_receiver);
|
||||
let subsystem =
|
||||
AvailabilityRecoverySubsystem::with_fast_path(collation_req_receiver, Metrics::new_dummy());
|
||||
let subsystem = async {
|
||||
subsystem.run(context).await.unwrap();
|
||||
};
|
||||
@@ -86,7 +87,10 @@ fn test_harness_chunks_only<T: Future<Output = (VirtualOverseer, RequestResponse
|
||||
let (context, virtual_overseer) = test_helpers::make_subsystem_context(pool.clone());
|
||||
|
||||
let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver();
|
||||
let subsystem = AvailabilityRecoverySubsystem::with_chunks_only(collation_req_receiver);
|
||||
let subsystem = AvailabilityRecoverySubsystem::with_chunks_only(
|
||||
collation_req_receiver,
|
||||
Metrics::new_dummy(),
|
||||
);
|
||||
let subsystem = subsystem.run(context);
|
||||
|
||||
let test_fut = test(virtual_overseer, req_cfg);
|
||||
@@ -153,6 +157,11 @@ enum Has {
|
||||
No,
|
||||
Yes,
|
||||
NetworkError(sc_network::RequestFailure),
|
||||
/// Make request not return at all, instead the sender is returned from the function.
|
||||
///
|
||||
/// Note, if you use `DoesNotReturn` you have to keep the returned senders alive, otherwise the
|
||||
/// subsystem will receive a cancel event and the request actually does return.
|
||||
DoesNotReturn,
|
||||
}
|
||||
|
||||
impl Has {
|
||||
@@ -255,42 +264,50 @@ impl TestState {
|
||||
virtual_overseer: &mut VirtualOverseer,
|
||||
n: usize,
|
||||
who_has: impl Fn(usize) -> Has,
|
||||
) {
|
||||
) -> Vec<oneshot::Sender<std::result::Result<Vec<u8>, RequestFailure>>> {
|
||||
// arbitrary order.
|
||||
for _ in 0..n {
|
||||
let mut i = 0;
|
||||
let mut senders = Vec::new();
|
||||
while i < n {
|
||||
// Receive a request for a chunk.
|
||||
assert_matches!(
|
||||
overseer_recv(virtual_overseer).await,
|
||||
AllMessages::NetworkBridge(
|
||||
NetworkBridgeMessage::SendRequests(
|
||||
mut requests,
|
||||
requests,
|
||||
IfDisconnected::TryConnect,
|
||||
)
|
||||
) => {
|
||||
assert_eq!(requests.len(), 1);
|
||||
for req in requests {
|
||||
i += 1;
|
||||
assert_matches!(
|
||||
req,
|
||||
Requests::ChunkFetching(req) => {
|
||||
assert_eq!(req.payload.candidate_hash, candidate_hash);
|
||||
|
||||
assert_matches!(
|
||||
requests.pop().unwrap(),
|
||||
Requests::ChunkFetching(req) => {
|
||||
assert_eq!(req.payload.candidate_hash, candidate_hash);
|
||||
let validator_index = req.payload.index.0 as usize;
|
||||
let available_data = match who_has(validator_index) {
|
||||
Has::No => Ok(None),
|
||||
Has::Yes => Ok(Some(self.chunks[validator_index].clone().into())),
|
||||
Has::NetworkError(e) => Err(e),
|
||||
Has::DoesNotReturn => {
|
||||
senders.push(req.pending_response);
|
||||
continue
|
||||
}
|
||||
};
|
||||
|
||||
let validator_index = req.payload.index.0 as usize;
|
||||
let available_data = match who_has(validator_index) {
|
||||
Has::No => Ok(None),
|
||||
Has::Yes => Ok(Some(self.chunks[validator_index].clone().into())),
|
||||
Has::NetworkError(e) => Err(e),
|
||||
};
|
||||
|
||||
let _ = req.pending_response.send(
|
||||
available_data.map(|r|
|
||||
req_res::v1::ChunkFetchingResponse::from(r).encode()
|
||||
)
|
||||
);
|
||||
}
|
||||
)
|
||||
let _ = req.pending_response.send(
|
||||
available_data.map(|r|
|
||||
req_res::v1::ChunkFetchingResponse::from(r).encode()
|
||||
)
|
||||
);
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
senders
|
||||
}
|
||||
|
||||
async fn test_full_data_requests(
|
||||
@@ -298,7 +315,8 @@ impl TestState {
|
||||
candidate_hash: CandidateHash,
|
||||
virtual_overseer: &mut VirtualOverseer,
|
||||
who_has: impl Fn(usize) -> Has,
|
||||
) {
|
||||
) -> Vec<oneshot::Sender<std::result::Result<Vec<u8>, sc_network::RequestFailure>>> {
|
||||
let mut senders = Vec::new();
|
||||
for _ in 0..self.validators.len() {
|
||||
// Receive a request for a chunk.
|
||||
assert_matches!(
|
||||
@@ -324,6 +342,10 @@ impl TestState {
|
||||
Has::No => Ok(None),
|
||||
Has::Yes => Ok(Some(self.available_data.clone())),
|
||||
Has::NetworkError(e) => Err(e),
|
||||
Has::DoesNotReturn => {
|
||||
senders.push(req.pending_response);
|
||||
continue
|
||||
}
|
||||
};
|
||||
|
||||
let done = available_data.as_ref().ok().map_or(false, |x| x.is_some());
|
||||
@@ -340,6 +362,7 @@ impl TestState {
|
||||
}
|
||||
);
|
||||
}
|
||||
senders
|
||||
}
|
||||
}
|
||||
|
||||
@@ -980,13 +1003,12 @@ fn chunks_retry_until_all_nodes_respond() {
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
test_state.validators.len(),
|
||||
test_state.validators.len() - test_state.threshold(),
|
||||
|_| Has::timeout(),
|
||||
)
|
||||
.await;
|
||||
|
||||
// we get to go another round!
|
||||
|
||||
test_state
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
@@ -1002,6 +1024,155 @@ fn chunks_retry_until_all_nodes_respond() {
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_returning_requests_wont_stall_retrieval() {
|
||||
let test_state = TestState::default();
|
||||
|
||||
test_harness_chunks_only(|mut virtual_overseer, req_cfg| async move {
|
||||
overseer_signal(
|
||||
&mut virtual_overseer,
|
||||
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
|
||||
hash: test_state.current.clone(),
|
||||
number: 1,
|
||||
status: LeafStatus::Fresh,
|
||||
span: Arc::new(jaeger::Span::Disabled),
|
||||
})),
|
||||
)
|
||||
.await;
|
||||
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
overseer_send(
|
||||
&mut virtual_overseer,
|
||||
AvailabilityRecoveryMessage::RecoverAvailableData(
|
||||
test_state.candidate.clone(),
|
||||
test_state.session_index,
|
||||
Some(GroupIndex(0)),
|
||||
tx,
|
||||
),
|
||||
)
|
||||
.await;
|
||||
|
||||
test_state.test_runtime_api(&mut virtual_overseer).await;
|
||||
|
||||
let candidate_hash = test_state.candidate.hash();
|
||||
|
||||
test_state.respond_to_available_data_query(&mut virtual_overseer, false).await;
|
||||
test_state.respond_to_query_all_request(&mut virtual_overseer, |_| false).await;
|
||||
|
||||
// How many validators should not respond at all:
|
||||
let not_returning_count = 1;
|
||||
|
||||
// Not returning senders won't cause the retrieval to stall:
|
||||
let _senders = test_state
|
||||
.test_chunk_requests(candidate_hash, &mut virtual_overseer, not_returning_count, |_| {
|
||||
Has::DoesNotReturn
|
||||
})
|
||||
.await;
|
||||
|
||||
test_state
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
// Should start over:
|
||||
test_state.validators.len() + 3,
|
||||
|_| Has::timeout(),
|
||||
)
|
||||
.await;
|
||||
|
||||
// we get to go another round!
|
||||
test_state
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
test_state.threshold(),
|
||||
|_| Has::Yes,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Recovered data should match the original one:
|
||||
assert_eq!(rx.await.unwrap().unwrap(), test_state.available_data);
|
||||
(virtual_overseer, req_cfg)
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_not_returning_requests_still_recovers_on_return() {
|
||||
let test_state = TestState::default();
|
||||
|
||||
test_harness_chunks_only(|mut virtual_overseer, req_cfg| async move {
|
||||
overseer_signal(
|
||||
&mut virtual_overseer,
|
||||
OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(ActivatedLeaf {
|
||||
hash: test_state.current.clone(),
|
||||
number: 1,
|
||||
status: LeafStatus::Fresh,
|
||||
span: Arc::new(jaeger::Span::Disabled),
|
||||
})),
|
||||
)
|
||||
.await;
|
||||
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
overseer_send(
|
||||
&mut virtual_overseer,
|
||||
AvailabilityRecoveryMessage::RecoverAvailableData(
|
||||
test_state.candidate.clone(),
|
||||
test_state.session_index,
|
||||
Some(GroupIndex(0)),
|
||||
tx,
|
||||
),
|
||||
)
|
||||
.await;
|
||||
|
||||
test_state.test_runtime_api(&mut virtual_overseer).await;
|
||||
|
||||
let candidate_hash = test_state.candidate.hash();
|
||||
|
||||
test_state.respond_to_available_data_query(&mut virtual_overseer, false).await;
|
||||
test_state.respond_to_query_all_request(&mut virtual_overseer, |_| false).await;
|
||||
|
||||
let senders = test_state
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
test_state.validators.len(),
|
||||
|_| Has::DoesNotReturn,
|
||||
)
|
||||
.await;
|
||||
|
||||
future::join(
|
||||
async {
|
||||
Delay::new(Duration::from_millis(10)).await;
|
||||
// Now retrieval should be able to recover.
|
||||
std::mem::drop(senders);
|
||||
},
|
||||
test_state.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
// Should start over:
|
||||
test_state.validators.len() + 3,
|
||||
|_| Has::timeout(),
|
||||
),
|
||||
)
|
||||
.await;
|
||||
|
||||
// we get to go another round!
|
||||
test_state
|
||||
.test_chunk_requests(
|
||||
candidate_hash,
|
||||
&mut virtual_overseer,
|
||||
test_state.threshold(),
|
||||
|_| Has::Yes,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Recovered data should match the original one:
|
||||
assert_eq!(rx.await.unwrap().unwrap(), test_state.available_data);
|
||||
(virtual_overseer, req_cfg)
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn returns_early_if_we_have_the_data() {
|
||||
let test_state = TestState::default();
|
||||
@@ -1097,3 +1268,34 @@ fn does_not_query_local_validator() {
|
||||
(virtual_overseer, req_cfg)
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parallel_request_calculation_works_as_expected() {
|
||||
let num_validators = 100;
|
||||
let threshold = recovery_threshold(num_validators).unwrap();
|
||||
let mut phase = RequestChunksPhase::new(100);
|
||||
assert_eq!(phase.get_desired_request_count(threshold), threshold);
|
||||
phase.error_count = 1;
|
||||
phase.total_received_responses = 1;
|
||||
// We saturate at threshold (34):
|
||||
assert_eq!(phase.get_desired_request_count(threshold), threshold);
|
||||
|
||||
let dummy_chunk =
|
||||
ErasureChunk { chunk: Vec::new(), index: ValidatorIndex(0), proof: Proof::dummy_proof() };
|
||||
phase.received_chunks.insert(ValidatorIndex(0), dummy_chunk.clone());
|
||||
phase.total_received_responses = 2;
|
||||
// With given error rate - still saturating:
|
||||
assert_eq!(phase.get_desired_request_count(threshold), threshold);
|
||||
for i in 1..9 {
|
||||
phase.received_chunks.insert(ValidatorIndex(i), dummy_chunk.clone());
|
||||
}
|
||||
phase.total_received_responses += 8;
|
||||
// error rate: 1/10
|
||||
// remaining chunks needed: threshold (34) - 9
|
||||
// expected: 24 * (1+ 1/10) = (next greater integer) = 27
|
||||
assert_eq!(phase.get_desired_request_count(threshold), 27);
|
||||
phase.received_chunks.insert(ValidatorIndex(9), dummy_chunk.clone());
|
||||
phase.error_count = 0;
|
||||
// With error count zero - we should fetch exactly as needed:
|
||||
assert_eq!(phase.get_desired_request_count(threshold), threshold - phase.received_chunks.len());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user