mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 11:41:04 +00:00
Pvf refactor execute worker errors follow up (#4071)
Follow-up of https://github.com/paritytech/polkadot-sdk/pull/2604; closes https://github.com/paritytech/polkadot-sdk/pull/2604. - [x] take relevant changes from Marcin's PR - [x] extract common duplicate code for workers (low-hanging fruit) ~Some failing CI checks are more general and should be fixed in master (see https://github.com/paritytech/polkadot-sdk/pull/4074)~ Proposed labels: **T0-node**, **R0-silent**, **I4-refactor** ----- kusama address: FZXVQLqLbFV2otNXs6BMnNch54CFJ1idpWwjMb3Z8fTLQC6 --------- Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
@@ -136,6 +136,9 @@ pub enum InternalValidationError {
|
||||
/// Could not find or open compiled artifact file.
|
||||
#[error("validation: could not find or open compiled artifact file: {0}")]
|
||||
CouldNotOpenFile(String),
|
||||
/// Could not create a pipe between the worker and a child process.
|
||||
#[error("validation: could not create pipe: {0}")]
|
||||
CouldNotCreatePipe(String),
|
||||
/// Host could not clear the worker cache after a job.
|
||||
#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
|
||||
CouldNotClearWorkerDir {
|
||||
|
||||
@@ -30,35 +30,36 @@ pub struct Handshake {
|
||||
|
||||
/// The response from the execution worker.
|
||||
#[derive(Debug, Encode, Decode)]
|
||||
pub enum WorkerResponse {
|
||||
/// The job completed successfully.
|
||||
Ok {
|
||||
/// The result of parachain validation.
|
||||
result_descriptor: ValidationResult,
|
||||
/// The amount of CPU time taken by the job.
|
||||
duration: Duration,
|
||||
},
|
||||
/// The candidate is invalid.
|
||||
InvalidCandidate(String),
|
||||
/// Instantiation of the WASM module instance failed during an execution.
|
||||
/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
|
||||
RuntimeConstruction(String),
|
||||
pub struct WorkerResponse {
|
||||
/// The response from the execute job process.
|
||||
pub job_response: JobResponse,
|
||||
/// The amount of CPU time taken by the job.
|
||||
pub duration: Duration,
|
||||
}
|
||||
|
||||
/// An error occurred in the worker process.
|
||||
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
|
||||
pub enum WorkerError {
|
||||
/// The job timed out.
|
||||
#[error("The job timed out")]
|
||||
JobTimedOut,
|
||||
/// The job process has died. We must kill the worker just in case.
|
||||
///
|
||||
/// We cannot treat this as an internal error because malicious code may have killed the job.
|
||||
/// We still retry it, because in the non-malicious case it is likely spurious.
|
||||
#[error("The job process (pid {job_pid}) has died: {err}")]
|
||||
JobDied { err: String, job_pid: i32 },
|
||||
/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
|
||||
/// etc.
|
||||
///
|
||||
/// Because malicious code can cause a job error, we must not treat it as an internal error. We
|
||||
/// still retry it, because in the non-malicious case it is likely spurious.
|
||||
JobError(String),
|
||||
#[error("An unexpected error occurred in the job process: {0}")]
|
||||
JobError(#[from] JobError),
|
||||
|
||||
/// Some internal error occurred.
|
||||
InternalError(InternalValidationError),
|
||||
#[error("An internal error occurred: {0}")]
|
||||
InternalError(#[from] InternalValidationError),
|
||||
}
|
||||
|
||||
/// The result of a job on the execution worker.
|
||||
@@ -101,7 +102,7 @@ impl JobResponse {
|
||||
/// An unexpected error occurred in the execution job process. Because this comes from the job,
|
||||
/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we
|
||||
/// cannot raise an internal error based on this.
|
||||
#[derive(thiserror::Error, Debug, Encode, Decode)]
|
||||
#[derive(thiserror::Error, Clone, Debug, Encode, Decode)]
|
||||
pub enum JobError {
|
||||
#[error("The job timed out")]
|
||||
TimedOut,
|
||||
@@ -114,4 +115,7 @@ pub enum JobError {
|
||||
CouldNotSpawnThread(String),
|
||||
#[error("An error occurred in the CPU time monitor thread: {0}")]
|
||||
CpuTimeMonitorThread(String),
|
||||
/// Since the job can return any exit status it wants, we have to treat this as untrusted.
|
||||
#[error("Unexpected exit status: {0}")]
|
||||
UnexpectedExitStatus(i32),
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Contains functionality related to PVFs that is shared by the PVF host and the PVF workers.
|
||||
#![deny(unused_crate_dependencies)]
|
||||
|
||||
pub mod error;
|
||||
pub mod execute;
|
||||
|
||||
@@ -18,12 +18,7 @@ use crate::prepare::PrepareJobKind;
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
|
||||
use polkadot_primitives::ExecutorParams;
|
||||
use std::{
|
||||
cmp::{Eq, PartialEq},
|
||||
fmt,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{fmt, sync::Arc, time::Duration};
|
||||
|
||||
/// A struct that carries the exhaustive set of data to prepare an artifact out of plain
|
||||
/// Wasm binary
|
||||
|
||||
@@ -18,10 +18,13 @@
|
||||
|
||||
pub mod security;
|
||||
|
||||
use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
|
||||
use crate::{
|
||||
framed_recv_blocking, framed_send_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET,
|
||||
};
|
||||
use cpu_time::ProcessTime;
|
||||
use futures::never::Never;
|
||||
use parity_scale_codec::Decode;
|
||||
use nix::{errno::Errno, sys::resource::Usage};
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
use std::{
|
||||
any::Any,
|
||||
fmt::{self},
|
||||
@@ -58,8 +61,6 @@ macro_rules! decl_worker_main {
|
||||
|
||||
$crate::sp_tracing::try_init_simple();
|
||||
|
||||
let worker_pid = std::process::id();
|
||||
|
||||
let args = std::env::args().collect::<Vec<_>>();
|
||||
if args.len() == 1 {
|
||||
print_help($expected_command);
|
||||
@@ -548,6 +549,81 @@ fn recv_worker_handshake(stream: &mut UnixStream) -> io::Result<WorkerHandshake>
|
||||
Ok(worker_handshake)
|
||||
}
|
||||
|
||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
||||
/// and system time.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `rusage`: Contains resource usage information.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Duration` representing the total CPU time.
|
||||
pub fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
||||
|
||||
return Duration::from_micros(micros)
|
||||
}
|
||||
|
||||
/// Get a job response.
|
||||
pub fn recv_child_response<T>(
|
||||
received_data: &mut io::BufReader<&[u8]>,
|
||||
context: &'static str,
|
||||
) -> io::Result<T>
|
||||
where
|
||||
T: Decode,
|
||||
{
|
||||
let response_bytes = framed_recv_blocking(received_data)?;
|
||||
T::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("{} pvf recv_child_response: decode error: {}", context, e),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_result<T, E>(
|
||||
stream: &mut UnixStream,
|
||||
result: Result<T, E>,
|
||||
worker_info: &WorkerInfo,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
T: std::fmt::Debug,
|
||||
E: std::fmt::Debug + std::fmt::Display,
|
||||
Result<T, E>: Encode,
|
||||
{
|
||||
if let Err(ref err) = result {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
gum::trace!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: sending result to host: {:?}",
|
||||
result
|
||||
);
|
||||
|
||||
framed_send_blocking(stream, &result.encode()).map_err(|err| {
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
?worker_info,
|
||||
"worker: error occurred sending result to host: {}",
|
||||
err
|
||||
);
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
pub fn stringify_errno(context: &'static str, errno: Errno) -> String {
|
||||
format!("{}: {}: {}", context, errno, io::Error::last_os_error())
|
||||
}
|
||||
|
||||
/// Functionality related to threads spawned by the workers.
|
||||
///
|
||||
/// The motivation for this module is to coordinate worker threads without using async Rust.
|
||||
|
||||
Reference in New Issue
Block a user