Move PVF timeouts to executor environment parameters (#6823)

* Move PVF timeouts to executor environment parameters

* Typo

Co-authored-by: Marcin S. <marcin@realemail.net>

* Fix comments

* Change handle_import_statements to FatalResult (#6820)

* Changing dispute db errors to fatal

* fmt

* Change node-key for bootnodes (#6772)

* Additional tracing in `provisioner`, `vote_selection` and `dispute-coordinator` (#6775)

* Additional tracing in `provisioner`, `vote_selection`

* Add `fetched_onchain_disputes` metric to provisioner

* Some tracelines in dispute-coordinator

TODO: cherry pick this in the initial branch!!!

* Remove spammy logs

* Remove some trace lines

* Rename and fix things

* Fix comments

* Typo

* Minor fixes

* Add codec indexes; Remove macro

---------

Co-authored-by: Marcin S. <marcin@realemail.net>
Co-authored-by: Bradley Olson <34992650+BradleyOlson64@users.noreply.github.com>
Co-authored-by: Petr Mensik <petr.mensik1@gmail.com>
Co-authored-by: Tsvetomir Dimitrov <tsvetomir@parity.io>
This commit is contained in:
s0me0ne-unkn0wn
2023-03-08 23:43:51 +01:00
committed by GitHub
parent 1c2215a75a
commit 03d4af104f
25 changed files with 359 additions and 415 deletions
@@ -24,7 +24,7 @@
#![warn(missing_docs)]
use polkadot_node_core_pvf::{
InvalidCandidate as WasmInvalidCandidate, PrepareError, PrepareStats, PvfWithExecutorParams,
InvalidCandidate as WasmInvalidCandidate, PrepareError, PrepareStats, PvfPrepData,
ValidationError, ValidationHost,
};
use polkadot_node_primitives::{
@@ -43,7 +43,8 @@ use polkadot_node_subsystem_util::executor_params_at_relay_parent;
use polkadot_parachain::primitives::{ValidationParams, ValidationResult as WasmValidationResult};
use polkadot_primitives::{
vstaging::ExecutorParams, CandidateCommitments, CandidateDescriptor, CandidateReceipt, Hash,
OccupiedCoreAssumption, PersistedValidationData, ValidationCode, ValidationCodeHash,
OccupiedCoreAssumption, PersistedValidationData, PvfExecTimeoutKind, PvfPrepTimeoutKind,
ValidationCode, ValidationCodeHash,
};
use parity_scale_codec::Encode;
@@ -68,6 +69,13 @@ const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_secs(3);
#[cfg(test)]
const PVF_EXECUTION_RETRY_DELAY: Duration = Duration::from_millis(200);
// Default PVF timeouts. Must never be changed! Use executor environment parameters in
// `session_info` pallet to adjust them. See also `PvfTimeoutKind` docs.
const DEFAULT_PRECHECK_PREPARATION_TIMEOUT: Duration = Duration::from_secs(60);
const DEFAULT_LENIENT_PREPARATION_TIMEOUT: Duration = Duration::from_secs(360);
const DEFAULT_BACKING_EXECUTION_TIMEOUT: Duration = Duration::from_secs(2);
const DEFAULT_APPROVAL_EXECUTION_TIMEOUT: Duration = Duration::from_secs(12);
/// Configuration for the candidate validation subsystem
#[derive(Clone)]
pub struct Config {
@@ -330,18 +338,20 @@ where
return PreCheckOutcome::Invalid
};
let pvf_with_params = match sp_maybe_compressed_blob::decompress(
let timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Precheck);
let pvf = match sp_maybe_compressed_blob::decompress(
&validation_code.0,
VALIDATION_CODE_BOMB_LIMIT,
) {
Ok(code) => PvfWithExecutorParams::from_code(code.into_owned(), executor_params),
Ok(code) => PvfPrepData::from_code(code.into_owned(), executor_params, timeout),
Err(e) => {
gum::debug!(target: LOG_TARGET, err=?e, "precheck: cannot decompress validation code");
return PreCheckOutcome::Invalid
},
};
match validation_backend.precheck_pvf(pvf_with_params).await {
match validation_backend.precheck_pvf(pvf).await {
Ok(_) => PreCheckOutcome::Valid,
Err(prepare_err) =>
if prepare_err.is_deterministic() {
@@ -465,7 +475,7 @@ async fn validate_from_chain_state<Sender>(
validation_host: ValidationHost,
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
timeout: Duration,
exec_timeout_kind: PvfExecTimeoutKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed>
where
@@ -485,7 +495,7 @@ where
validation_code,
candidate_receipt.clone(),
pov,
timeout,
exec_timeout_kind,
metrics,
)
.await;
@@ -521,7 +531,7 @@ async fn validate_candidate_exhaustive<Sender>(
validation_code: ValidationCode,
candidate_receipt: CandidateReceipt,
pov: Arc<PoV>,
timeout: Duration,
exec_timeout_kind: PvfExecTimeoutKind,
metrics: &Metrics,
) -> Result<ValidationResult, ValidationFailed>
where
@@ -606,7 +616,7 @@ where
let result = validation_backend
.validate_candidate_with_retry(
raw_validation_code.to_vec(),
timeout,
pvf_exec_timeout(&executor_params, exec_timeout_kind),
params,
executor_params,
)
@@ -667,8 +677,8 @@ trait ValidationBackend {
/// Tries executing a PVF a single time (no retries).
async fn validate_candidate(
&mut self,
pvf_with_params: PvfWithExecutorParams,
timeout: Duration,
pvf: PvfPrepData,
exec_timeout: Duration,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;
@@ -677,16 +687,16 @@ trait ValidationBackend {
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
timeout: Duration,
exec_timeout: Duration,
params: ValidationParams,
executor_params: ExecutorParams,
) -> Result<WasmValidationResult, ValidationError> {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf_with_params =
PvfWithExecutorParams::from_code(raw_validation_code, executor_params);
let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
let pvf = PvfPrepData::from_code(raw_validation_code, executor_params, prep_timeout);
let mut validation_result =
self.validate_candidate(pvf_with_params.clone(), timeout, params.encode()).await;
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
// assumption that the conditions that caused this error may have been transient. Note that
@@ -699,23 +709,19 @@ trait ValidationBackend {
gum::warn!(
target: LOG_TARGET,
?pvf_with_params,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
);
// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
validation_result =
self.validate_candidate(pvf_with_params, timeout, params.encode()).await;
validation_result = self.validate_candidate(pvf, exec_timeout, params.encode()).await;
}
validation_result
}
async fn precheck_pvf(
&mut self,
pvf_with_params: PvfWithExecutorParams,
) -> Result<PrepareStats, PrepareError>;
async fn precheck_pvf(&mut self, pvf: PvfPrepData) -> Result<PrepareStats, PrepareError>;
}
#[async_trait]
@@ -723,16 +729,14 @@ impl ValidationBackend for ValidationHost {
/// Tries executing a PVF a single time (no retries).
async fn validate_candidate(
&mut self,
pvf_with_params: PvfWithExecutorParams,
timeout: Duration,
pvf: PvfPrepData,
exec_timeout: Duration,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError> {
let priority = polkadot_node_core_pvf::Priority::Normal;
let (tx, rx) = oneshot::channel();
if let Err(err) =
self.execute_pvf(pvf_with_params, timeout, encoded_params, priority, tx).await
{
if let Err(err) = self.execute_pvf(pvf, exec_timeout, encoded_params, priority, tx).await {
return Err(ValidationError::InternalError(format!(
"cannot send pvf to the validation host: {:?}",
err
@@ -743,12 +747,9 @@ impl ValidationBackend for ValidationHost {
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?
}
async fn precheck_pvf(
&mut self,
pvf_with_params: PvfWithExecutorParams,
) -> Result<PrepareStats, PrepareError> {
async fn precheck_pvf(&mut self, pvf: PvfPrepData) -> Result<PrepareStats, PrepareError> {
let (tx, rx) = oneshot::channel();
if let Err(err) = self.precheck_pvf(pvf_with_params, tx).await {
if let Err(err) = self.precheck_pvf(pvf, tx).await {
// Return an IO error if there was an error communicating with the host.
return Err(PrepareError::IoErr(err))
}
@@ -788,3 +789,23 @@ fn perform_basic_checks(
Ok(())
}
fn pvf_prep_timeout(executor_params: &ExecutorParams, kind: PvfPrepTimeoutKind) -> Duration {
if let Some(timeout) = executor_params.pvf_prep_timeout(kind) {
return timeout
}
match kind {
PvfPrepTimeoutKind::Precheck => DEFAULT_PRECHECK_PREPARATION_TIMEOUT,
PvfPrepTimeoutKind::Lenient => DEFAULT_LENIENT_PREPARATION_TIMEOUT,
}
}
fn pvf_exec_timeout(executor_params: &ExecutorParams, kind: PvfExecTimeoutKind) -> Duration {
if let Some(timeout) = executor_params.pvf_exec_timeout(kind) {
return timeout
}
match kind {
PvfExecTimeoutKind::Backing => DEFAULT_BACKING_EXECUTION_TIMEOUT,
PvfExecTimeoutKind::Approval => DEFAULT_APPROVAL_EXECUTION_TIMEOUT,
}
}
@@ -396,7 +396,7 @@ impl MockValidateCandidateBackend {
impl ValidationBackend for MockValidateCandidateBackend {
async fn validate_candidate(
&mut self,
_pvf_with_params: PvfWithExecutorParams,
_pvf: PvfPrepData,
_timeout: Duration,
_encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError> {
@@ -408,10 +408,7 @@ impl ValidationBackend for MockValidateCandidateBackend {
result
}
async fn precheck_pvf(
&mut self,
_pvf_with_params: PvfWithExecutorParams,
) -> Result<PrepareStats, PrepareError> {
async fn precheck_pvf(&mut self, _pvf: PvfPrepData) -> Result<PrepareStats, PrepareError> {
unreachable!()
}
}
@@ -476,7 +473,7 @@ fn candidate_validation_ok_is_ok() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
})
@@ -535,7 +532,7 @@ fn candidate_validation_bad_return_is_invalid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
});
@@ -606,7 +603,7 @@ fn candidate_validation_one_ambiguous_error_is_valid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
})
@@ -666,7 +663,7 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
})
@@ -718,7 +715,7 @@ fn candidate_validation_timeout_is_internal_error() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
});
@@ -770,7 +767,7 @@ fn candidate_validation_commitment_hash_mismatch_is_invalid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
})
@@ -821,7 +818,7 @@ fn candidate_validation_code_mismatch_is_invalid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&Default::default(),
))
.unwrap();
@@ -884,7 +881,7 @@ fn compressed_code_works() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&metrics,
)
});
@@ -937,7 +934,7 @@ fn code_decompression_failure_is_error() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&Default::default(),
));
@@ -990,7 +987,7 @@ fn pov_decompression_failure_is_invalid() {
validation_code,
candidate_receipt,
Arc::new(pov),
Duration::from_secs(0),
PvfExecTimeoutKind::Backing,
&Default::default(),
));
@@ -1011,17 +1008,14 @@ impl MockPreCheckBackend {
impl ValidationBackend for MockPreCheckBackend {
async fn validate_candidate(
&mut self,
_pvf_with_params: PvfWithExecutorParams,
_pvf: PvfPrepData,
_timeout: Duration,
_encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError> {
unreachable!()
}
async fn precheck_pvf(
&mut self,
_pvf_with_params: PvfWithExecutorParams,
) -> Result<PrepareStats, PrepareError> {
async fn precheck_pvf(&mut self, _pvf: PvfPrepData) -> Result<PrepareStats, PrepareError> {
self.result.clone()
}
}