Change prepare worker to use fork instead of threads (#1685)

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2023-11-14 14:50:18 -03:00
committed by GitHub
parent 3a87390b30
commit 54f84285bf
24 changed files with 1468 additions and 534 deletions
+18 -3
View File
@@ -339,17 +339,17 @@ fn handle_mux(
spawned,
worker,
idle,
Err(PrepareError::CreateTmpFileErr(err)),
Err(PrepareError::CreateTmpFile(err)),
),
// Return `Concluded`, but do not kill the worker since the error was on the host
// side.
Outcome::RenameTmpFileErr { worker: idle, result: _, err, src, dest } =>
Outcome::RenameTmpFile { worker: idle, result: _, err, src, dest } =>
handle_concluded_no_rip(
from_pool,
spawned,
worker,
idle,
Err(PrepareError::RenameTmpFileErr { err, src, dest }),
Err(PrepareError::RenameTmpFile { err, src, dest }),
),
// Could not clear worker cache. Kill the worker so other jobs can't see the data.
Outcome::ClearWorkerDir { err } => {
@@ -387,6 +387,21 @@ fn handle_mux(
Ok(())
},
// The worker might still be usable, but we kill it just in case.
Outcome::JobDied(err) => {
if attempt_retire(metrics, spawned, worker) {
reply(
from_pool,
FromPool::Concluded {
worker,
rip: true,
result: Err(PrepareError::JobDied(err)),
},
)?;
}
Ok(())
},
Outcome::TimedOut => {
if attempt_retire(metrics, spawned, worker) {
reply(
@@ -79,7 +79,7 @@ pub enum Outcome {
CreateTmpFileErr { worker: IdleWorker, err: String },
/// The response from the worker is received, but the tmp file cannot be renamed (moved) to the
/// final destination location.
RenameTmpFileErr {
RenameTmpFile {
worker: IdleWorker,
result: PrepareResult,
err: String,
@@ -100,6 +100,10 @@ pub enum Outcome {
IoErr(String),
/// The worker ran out of memory and is aborting. The worker should be ripped.
OutOfMemory,
/// The preparation job process died due to OOM, a seccomp violation, or some other factor.
///
/// The worker might still be usable, but we kill it just in case.
JobDied(String),
}
/// Given the idle token of a worker and parameters of work, communicates with the worker and
@@ -187,21 +191,6 @@ pub async fn start_work(
"failed to recv a prepare response: {:?}",
err,
);
// The worker died. Check if it was due to a seccomp violation.
//
// NOTE: Log, but don't change the outcome. Not all validators may have auditing
// enabled, so we don't want attackers to abuse a non-deterministic outcome.
for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
gum::error!(
target: LOG_TARGET,
worker_pid = %pid,
%syscall,
?pvf,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
Outcome::IoErr(err.to_string())
},
Err(_) => {
@@ -236,6 +225,7 @@ async fn handle_response(
Ok(result) => result,
// Timed out on the child. This should already be logged by the child.
Err(PrepareError::TimedOut) => return Outcome::TimedOut,
Err(PrepareError::JobDied(err)) => return Outcome::JobDied(err),
Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory,
Err(_) => return Outcome::Concluded { worker, result },
};
@@ -272,7 +262,7 @@ async fn handle_response(
artifact_path.display(),
err,
);
Outcome::RenameTmpFileErr {
Outcome::RenameTmpFile {
worker,
result,
err: format!("{:?}", err),