mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 01:11:10 +00:00
change prepare worker to use fork instead of threads (#1685)
Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
@@ -342,20 +342,27 @@ fn handle_job_finish(
|
||||
},
|
||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
||||
Some(idle_worker),
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedInvalid(err))),
|
||||
None,
|
||||
),
|
||||
Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None),
|
||||
// Either the worker or the job timed out. Kill the worker in either case. Treated as
|
||||
// definitely-invalid, because if we timed out, there's no time left for a retry.
|
||||
Outcome::HardTimeout =>
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
|
||||
// "Maybe invalid" errors (will retry).
|
||||
Outcome::IoErr => (
|
||||
Outcome::WorkerIntfErr => (
|
||||
None,
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
|
||||
None,
|
||||
),
|
||||
Outcome::Panic { err } =>
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None),
|
||||
Outcome::JobDied { err } => (
|
||||
None,
|
||||
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))),
|
||||
None,
|
||||
),
|
||||
Outcome::JobError { err } =>
|
||||
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::JobError(err))), None),
|
||||
};
|
||||
|
||||
queue.metrics.execute_finished();
|
||||
|
||||
@@ -30,7 +30,7 @@ use futures_timer::Delay;
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
use polkadot_node_core_pvf_common::{
|
||||
error::InternalValidationError,
|
||||
execute::{Handshake, Response},
|
||||
execute::{Handshake, WorkerResponse},
|
||||
worker_dir, SecurityStatus,
|
||||
};
|
||||
use polkadot_parachain_primitives::primitives::ValidationResult;
|
||||
@@ -88,19 +88,26 @@ pub enum Outcome {
|
||||
/// a trap. Errors related to the preparation process are not expected to be encountered by the
|
||||
/// execution workers.
|
||||
InvalidCandidate { err: String, idle_worker: IdleWorker },
|
||||
/// The execution time exceeded the hard limit. The worker is terminated.
|
||||
HardTimeout,
|
||||
/// An I/O error happened during communication with the worker. This may mean that the worker
|
||||
/// process already died. The token is not returned in any case.
|
||||
WorkerIntfErr,
|
||||
/// The job process has died. We must kill the worker just in case.
|
||||
///
|
||||
/// We cannot treat this as an internal error because malicious code may have caused this.
|
||||
JobDied { err: String },
|
||||
/// An unexpected error occurred in the job process.
|
||||
///
|
||||
/// Because malicious code can cause a job error, we must not treat it as an internal error.
|
||||
JobError { err: String },
|
||||
|
||||
/// An internal error happened during the validation. Such an error is most likely related to
|
||||
/// some transient glitch.
|
||||
///
|
||||
/// Should only ever be used for errors independent of the candidate and PVF. Therefore it may
|
||||
/// be a problem with the worker, so we terminate it.
|
||||
InternalError { err: InternalValidationError },
|
||||
/// The execution time exceeded the hard limit. The worker is terminated.
|
||||
HardTimeout,
|
||||
/// An I/O error happened during communication with the worker. This may mean that the worker
|
||||
/// process already died. The token is not returned in any case.
|
||||
IoErr,
|
||||
/// An unexpected panic has occurred in the execution worker.
|
||||
Panic { err: String },
|
||||
}
|
||||
|
||||
/// Given the idle token of a worker and parameters of work, communicates with the worker and
|
||||
@@ -137,7 +144,7 @@ pub async fn start_work(
|
||||
?error,
|
||||
"failed to send an execute request",
|
||||
);
|
||||
return Outcome::IoErr
|
||||
return Outcome::WorkerIntfErr
|
||||
}
|
||||
|
||||
// We use a generous timeout here. This is in addition to the one in the child process, in
|
||||
@@ -173,7 +180,7 @@ pub async fn start_work(
|
||||
);
|
||||
}
|
||||
|
||||
return Outcome::IoErr
|
||||
return Outcome::WorkerIntfErr
|
||||
},
|
||||
Ok(response) => {
|
||||
// Check if any syscall violations occurred during the job. For now this is
|
||||
@@ -189,7 +196,7 @@ pub async fn start_work(
|
||||
);
|
||||
}
|
||||
|
||||
if let Response::Ok{duration, ..} = response {
|
||||
if let WorkerResponse::Ok{duration, ..} = response {
|
||||
if duration > execution_timeout {
|
||||
// The job didn't complete within the timeout.
|
||||
gum::warn!(
|
||||
@@ -201,7 +208,7 @@ pub async fn start_work(
|
||||
);
|
||||
|
||||
// Return a timeout error.
|
||||
return Outcome::HardTimeout;
|
||||
return Outcome::HardTimeout
|
||||
}
|
||||
}
|
||||
|
||||
@@ -216,23 +223,25 @@ pub async fn start_work(
|
||||
validation_code_hash = ?artifact.id.code_hash,
|
||||
"execution worker exceeded lenient timeout for execution, child worker likely stalled",
|
||||
);
|
||||
Response::TimedOut
|
||||
WorkerResponse::JobTimedOut
|
||||
},
|
||||
};
|
||||
|
||||
match response {
|
||||
Response::Ok { result_descriptor, duration } => Outcome::Ok {
|
||||
WorkerResponse::Ok { result_descriptor, duration } => Outcome::Ok {
|
||||
result_descriptor,
|
||||
duration,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
Response::InvalidCandidate(err) => Outcome::InvalidCandidate {
|
||||
WorkerResponse::InvalidCandidate(err) => Outcome::InvalidCandidate {
|
||||
err,
|
||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||
},
|
||||
Response::TimedOut => Outcome::HardTimeout,
|
||||
Response::Panic(err) => Outcome::Panic { err },
|
||||
Response::InternalError(err) => Outcome::InternalError { err },
|
||||
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
|
||||
WorkerResponse::JobDied(err) => Outcome::JobDied { err },
|
||||
WorkerResponse::JobError(err) => Outcome::JobError { err },
|
||||
|
||||
WorkerResponse::InternalError(err) => Outcome::InternalError { err },
|
||||
}
|
||||
})
|
||||
.await
|
||||
@@ -306,9 +315,9 @@ async fn send_request(
|
||||
framed_send(stream, &execution_timeout.encode()).await
|
||||
}
|
||||
|
||||
async fn recv_response(stream: &mut UnixStream) -> io::Result<Response> {
|
||||
async fn recv_response(stream: &mut UnixStream) -> io::Result<WorkerResponse> {
|
||||
let response_bytes = framed_recv(stream).await?;
|
||||
Response::decode(&mut &response_bytes[..]).map_err(|e| {
|
||||
WorkerResponse::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("execute pvf recv_response: decode error: {:?}", e),
|
||||
|
||||
Reference in New Issue
Block a user