change prepare worker to use fork instead of threads (#1685)

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2023-11-14 14:50:18 -03:00
committed by GitHub
parent 3a87390b30
commit 54f84285bf
24 changed files with 1468 additions and 534 deletions
+11 -4
View File
@@ -342,20 +342,27 @@ fn handle_job_finish(
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedInvalid(err))),
None,
),
Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
// "Maybe invalid" errors (will retry).
Outcome::IoErr => (
Outcome::WorkerIntfErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
None,
),
Outcome::Panic { err } =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None),
Outcome::JobDied { err } => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))),
None,
),
Outcome::JobError { err } =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::JobError(err))), None),
};
queue.metrics.execute_finished();
@@ -30,7 +30,7 @@ use futures_timer::Delay;
use parity_scale_codec::{Decode, Encode};
use polkadot_node_core_pvf_common::{
error::InternalValidationError,
execute::{Handshake, Response},
execute::{Handshake, WorkerResponse},
worker_dir, SecurityStatus,
};
use polkadot_parachain_primitives::primitives::ValidationResult;
@@ -88,19 +88,26 @@ pub enum Outcome {
/// a trap. Errors related to the preparation process are not expected to be encountered by the
/// execution workers.
InvalidCandidate { err: String, idle_worker: IdleWorker },
/// The execution time exceeded the hard limit. The worker is terminated.
HardTimeout,
/// An I/O error happened during communication with the worker. This may mean that the worker
/// process already died. The token is not returned in any case.
WorkerIntfErr,
/// The job process has died. We must kill the worker just in case.
///
/// We cannot treat this as an internal error because malicious code may have caused this.
JobDied { err: String },
/// An unexpected error occurred in the job process.
///
/// Because malicious code can cause a job error, we must not treat it as an internal error.
JobError { err: String },
/// An internal error happened during the validation. Such an error is most likely related to
/// some transient glitch.
///
/// Should only ever be used for errors independent of the candidate and PVF. Therefore it may
/// be a problem with the worker, so we terminate it.
InternalError { err: InternalValidationError },
/// The execution time exceeded the hard limit. The worker is terminated.
HardTimeout,
/// An I/O error happened during communication with the worker. This may mean that the worker
/// process already died. The token is not returned in any case.
IoErr,
/// An unexpected panic has occurred in the execution worker.
Panic { err: String },
}
/// Given the idle token of a worker and parameters of work, communicates with the worker and
@@ -137,7 +144,7 @@ pub async fn start_work(
?error,
"failed to send an execute request",
);
return Outcome::IoErr
return Outcome::WorkerIntfErr
}
// We use a generous timeout here. This is in addition to the one in the child process, in
@@ -173,7 +180,7 @@ pub async fn start_work(
);
}
return Outcome::IoErr
return Outcome::WorkerIntfErr
},
Ok(response) => {
// Check if any syscall violations occurred during the job. For now this is
@@ -189,7 +196,7 @@ pub async fn start_work(
);
}
if let Response::Ok{duration, ..} = response {
if let WorkerResponse::Ok{duration, ..} = response {
if duration > execution_timeout {
// The job didn't complete within the timeout.
gum::warn!(
@@ -201,7 +208,7 @@ pub async fn start_work(
);
// Return a timeout error.
return Outcome::HardTimeout;
return Outcome::HardTimeout
}
}
@@ -216,23 +223,25 @@ pub async fn start_work(
validation_code_hash = ?artifact.id.code_hash,
"execution worker exceeded lenient timeout for execution, child worker likely stalled",
);
Response::TimedOut
WorkerResponse::JobTimedOut
},
};
match response {
Response::Ok { result_descriptor, duration } => Outcome::Ok {
WorkerResponse::Ok { result_descriptor, duration } => Outcome::Ok {
result_descriptor,
duration,
idle_worker: IdleWorker { stream, pid, worker_dir },
},
Response::InvalidCandidate(err) => Outcome::InvalidCandidate {
WorkerResponse::InvalidCandidate(err) => Outcome::InvalidCandidate {
err,
idle_worker: IdleWorker { stream, pid, worker_dir },
},
Response::TimedOut => Outcome::HardTimeout,
Response::Panic(err) => Outcome::Panic { err },
Response::InternalError(err) => Outcome::InternalError { err },
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
WorkerResponse::JobDied(err) => Outcome::JobDied { err },
WorkerResponse::JobError(err) => Outcome::JobError { err },
WorkerResponse::InternalError(err) => Outcome::InternalError { err },
}
})
.await
@@ -306,9 +315,9 @@ async fn send_request(
framed_send(stream, &execution_timeout.encode()).await
}
async fn recv_response(stream: &mut UnixStream) -> io::Result<Response> {
async fn recv_response(stream: &mut UnixStream) -> io::Result<WorkerResponse> {
let response_bytes = framed_recv(stream).await?;
Response::decode(&mut &response_bytes[..]).map_err(|e| {
WorkerResponse::decode(&mut response_bytes.as_slice()).map_err(|e| {
io::Error::new(
io::ErrorKind::Other,
format!("execute pvf recv_response: decode error: {:?}", e),