mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-17 06:41:02 +00:00
Pvf refactor execute worker errors follow up (#4071)
follow up of https://github.com/paritytech/polkadot-sdk/pull/2604 closes https://github.com/paritytech/polkadot-sdk/pull/2604 - [x] take relevant changes from Marcin's PR - [x] extract common duplicate code for workers (low-hanging fruits) ~Some unpassed ci problems are more general and should be fixed in master (see https://github.com/paritytech/polkadot-sdk/pull/4074)~ Proposed labels: **T0-node**, **R0-silent**, **I4-refactor** ----- kusama address: FZXVQLqLbFV2otNXs6BMnNch54CFJ1idpWwjMb3Z8fTLQC6 --------- Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com>
This commit is contained in:
Generated
+2
-4
@@ -7575,9 +7575,9 @@ checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.152"
|
version = "0.2.153"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7"
|
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libflate"
|
name = "libflate"
|
||||||
@@ -13303,7 +13303,6 @@ dependencies = [
|
|||||||
"slotmap",
|
"slotmap",
|
||||||
"sp-core",
|
"sp-core",
|
||||||
"sp-maybe-compressed-blob",
|
"sp-maybe-compressed-blob",
|
||||||
"sp-wasm-interface 20.0.0",
|
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"test-parachain-adder",
|
"test-parachain-adder",
|
||||||
"test-parachain-halt",
|
"test-parachain-halt",
|
||||||
@@ -13340,7 +13339,6 @@ name = "polkadot-node-core-pvf-common"
|
|||||||
version = "7.0.0"
|
version = "7.0.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"assert_matches",
|
"assert_matches",
|
||||||
"cfg-if",
|
|
||||||
"cpu-time",
|
"cpu-time",
|
||||||
"futures",
|
"futures",
|
||||||
"landlock",
|
"landlock",
|
||||||
|
|||||||
@@ -17,8 +17,7 @@ cfg-if = "1.0"
|
|||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
futures-timer = "3.0.2"
|
futures-timer = "3.0.2"
|
||||||
gum = { package = "tracing-gum", path = "../../gum" }
|
gum = { package = "tracing-gum", path = "../../gum" }
|
||||||
is_executable = "1.0.1"
|
is_executable = { version = "1.0.1", optional = true }
|
||||||
libc = "0.2.152"
|
|
||||||
pin-project = "1.0.9"
|
pin-project = "1.0.9"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
slotmap = "1.0"
|
slotmap = "1.0"
|
||||||
@@ -26,7 +25,9 @@ tempfile = "3.3.0"
|
|||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
tokio = { version = "1.24.2", features = ["fs", "process"] }
|
tokio = { version = "1.24.2", features = ["fs", "process"] }
|
||||||
|
|
||||||
parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }
|
parity-scale-codec = { version = "3.6.1", default-features = false, features = [
|
||||||
|
"derive",
|
||||||
|
] }
|
||||||
|
|
||||||
polkadot-parachain-primitives = { path = "../../../parachain" }
|
polkadot-parachain-primitives = { path = "../../../parachain" }
|
||||||
polkadot-core-primitives = { path = "../../../core-primitives" }
|
polkadot-core-primitives = { path = "../../../core-primitives" }
|
||||||
@@ -37,14 +38,16 @@ polkadot-node-subsystem = { path = "../../subsystem" }
|
|||||||
polkadot-primitives = { path = "../../../primitives" }
|
polkadot-primitives = { path = "../../../primitives" }
|
||||||
|
|
||||||
sp-core = { path = "../../../../substrate/primitives/core" }
|
sp-core = { path = "../../../../substrate/primitives/core" }
|
||||||
sp-wasm-interface = { path = "../../../../substrate/primitives/wasm-interface" }
|
sp-maybe-compressed-blob = { path = "../../../../substrate/primitives/maybe-compressed-blob", optional = true }
|
||||||
sp-maybe-compressed-blob = { path = "../../../../substrate/primitives/maybe-compressed-blob" }
|
|
||||||
polkadot-node-core-pvf-prepare-worker = { path = "prepare-worker", optional = true }
|
polkadot-node-core-pvf-prepare-worker = { path = "prepare-worker", optional = true }
|
||||||
polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = true }
|
polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = true }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
assert_matches = "1.4.0"
|
assert_matches = "1.4.0"
|
||||||
criterion = { version = "0.4.0", default-features = false, features = ["async_tokio", "cargo_bench_support"] }
|
criterion = { version = "0.4.0", default-features = false, features = [
|
||||||
|
"async_tokio",
|
||||||
|
"cargo_bench_support",
|
||||||
|
] }
|
||||||
hex-literal = "0.4.1"
|
hex-literal = "0.4.1"
|
||||||
|
|
||||||
polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] }
|
polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] }
|
||||||
@@ -57,6 +60,7 @@ adder = { package = "test-parachain-adder", path = "../../../parachain/test-para
|
|||||||
halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" }
|
halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" }
|
||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dev-dependencies]
|
[target.'cfg(target_os = "linux")'.dev-dependencies]
|
||||||
|
libc = "0.2.153"
|
||||||
procfs = "0.16.0"
|
procfs = "0.16.0"
|
||||||
rusty-fork = "0.3.0"
|
rusty-fork = "0.3.0"
|
||||||
sc-sysinfo = { path = "../../../../substrate/client/sysinfo" }
|
sc-sysinfo = { path = "../../../../substrate/client/sysinfo" }
|
||||||
@@ -70,6 +74,8 @@ ci-only-tests = []
|
|||||||
jemalloc-allocator = ["polkadot-node-core-pvf-common/jemalloc-allocator"]
|
jemalloc-allocator = ["polkadot-node-core-pvf-common/jemalloc-allocator"]
|
||||||
# This feature is used to export test code to other crates without putting it in the production build.
|
# This feature is used to export test code to other crates without putting it in the production build.
|
||||||
test-utils = [
|
test-utils = [
|
||||||
"polkadot-node-core-pvf-execute-worker",
|
"dep:is_executable",
|
||||||
"polkadot-node-core-pvf-prepare-worker",
|
"dep:polkadot-node-core-pvf-execute-worker",
|
||||||
|
"dep:polkadot-node-core-pvf-prepare-worker",
|
||||||
|
"dep:sp-maybe-compressed-blob",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -10,14 +10,16 @@ license.workspace = true
|
|||||||
workspace = true
|
workspace = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cfg-if = "1.0"
|
|
||||||
cpu-time = "1.0.0"
|
cpu-time = "1.0.0"
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
gum = { package = "tracing-gum", path = "../../../gum" }
|
gum = { package = "tracing-gum", path = "../../../gum" }
|
||||||
libc = "0.2.152"
|
libc = "0.2.152"
|
||||||
|
nix = { version = "0.27.1", features = ["resource", "sched"] }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
|
|
||||||
parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }
|
parity-scale-codec = { version = "3.6.1", default-features = false, features = [
|
||||||
|
"derive",
|
||||||
|
] }
|
||||||
|
|
||||||
polkadot-parachain-primitives = { path = "../../../../parachain" }
|
polkadot-parachain-primitives = { path = "../../../../parachain" }
|
||||||
polkadot-primitives = { path = "../../../../primitives" }
|
polkadot-primitives = { path = "../../../../primitives" }
|
||||||
@@ -34,7 +36,6 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" }
|
|||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dependencies]
|
[target.'cfg(target_os = "linux")'.dependencies]
|
||||||
landlock = "0.3.0"
|
landlock = "0.3.0"
|
||||||
nix = { version = "0.27.1", features = ["sched"] }
|
|
||||||
|
|
||||||
[target.'cfg(all(target_os = "linux", target_arch = "x86_64"))'.dependencies]
|
[target.'cfg(all(target_os = "linux", target_arch = "x86_64"))'.dependencies]
|
||||||
seccompiler = "0.4.0"
|
seccompiler = "0.4.0"
|
||||||
|
|||||||
@@ -136,6 +136,9 @@ pub enum InternalValidationError {
|
|||||||
/// Could not find or open compiled artifact file.
|
/// Could not find or open compiled artifact file.
|
||||||
#[error("validation: could not find or open compiled artifact file: {0}")]
|
#[error("validation: could not find or open compiled artifact file: {0}")]
|
||||||
CouldNotOpenFile(String),
|
CouldNotOpenFile(String),
|
||||||
|
/// Could not create a pipe between the worker and a child process.
|
||||||
|
#[error("validation: could not create pipe: {0}")]
|
||||||
|
CouldNotCreatePipe(String),
|
||||||
/// Host could not clear the worker cache after a job.
|
/// Host could not clear the worker cache after a job.
|
||||||
#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
|
#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
|
||||||
CouldNotClearWorkerDir {
|
CouldNotClearWorkerDir {
|
||||||
|
|||||||
@@ -30,35 +30,36 @@ pub struct Handshake {
|
|||||||
|
|
||||||
/// The response from the execution worker.
|
/// The response from the execution worker.
|
||||||
#[derive(Debug, Encode, Decode)]
|
#[derive(Debug, Encode, Decode)]
|
||||||
pub enum WorkerResponse {
|
pub struct WorkerResponse {
|
||||||
/// The job completed successfully.
|
/// The response from the execute job process.
|
||||||
Ok {
|
pub job_response: JobResponse,
|
||||||
/// The result of parachain validation.
|
/// The amount of CPU time taken by the job.
|
||||||
result_descriptor: ValidationResult,
|
pub duration: Duration,
|
||||||
/// The amount of CPU time taken by the job.
|
}
|
||||||
duration: Duration,
|
|
||||||
},
|
/// An error occurred in the worker process.
|
||||||
/// The candidate is invalid.
|
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
|
||||||
InvalidCandidate(String),
|
pub enum WorkerError {
|
||||||
/// Instantiation of the WASM module instance failed during an execution.
|
|
||||||
/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
|
|
||||||
RuntimeConstruction(String),
|
|
||||||
/// The job timed out.
|
/// The job timed out.
|
||||||
|
#[error("The job timed out")]
|
||||||
JobTimedOut,
|
JobTimedOut,
|
||||||
/// The job process has died. We must kill the worker just in case.
|
/// The job process has died. We must kill the worker just in case.
|
||||||
///
|
///
|
||||||
/// We cannot treat this as an internal error because malicious code may have killed the job.
|
/// We cannot treat this as an internal error because malicious code may have killed the job.
|
||||||
/// We still retry it, because in the non-malicious case it is likely spurious.
|
/// We still retry it, because in the non-malicious case it is likely spurious.
|
||||||
|
#[error("The job process (pid {job_pid}) has died: {err}")]
|
||||||
JobDied { err: String, job_pid: i32 },
|
JobDied { err: String, job_pid: i32 },
|
||||||
/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
|
/// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic,
|
||||||
/// etc.
|
/// etc.
|
||||||
///
|
///
|
||||||
/// Because malicious code can cause a job error, we must not treat it as an internal error. We
|
/// Because malicious code can cause a job error, we must not treat it as an internal error. We
|
||||||
/// still retry it, because in the non-malicious case it is likely spurious.
|
/// still retry it, because in the non-malicious case it is likely spurious.
|
||||||
JobError(String),
|
#[error("An unexpected error occurred in the job process: {0}")]
|
||||||
|
JobError(#[from] JobError),
|
||||||
|
|
||||||
/// Some internal error occurred.
|
/// Some internal error occurred.
|
||||||
InternalError(InternalValidationError),
|
#[error("An internal error occurred: {0}")]
|
||||||
|
InternalError(#[from] InternalValidationError),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The result of a job on the execution worker.
|
/// The result of a job on the execution worker.
|
||||||
@@ -101,7 +102,7 @@ impl JobResponse {
|
|||||||
/// An unexpected error occurred in the execution job process. Because this comes from the job,
|
/// An unexpected error occurred in the execution job process. Because this comes from the job,
|
||||||
/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we
|
/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we
|
||||||
/// cannot raise an internal error based on this.
|
/// cannot raise an internal error based on this.
|
||||||
#[derive(thiserror::Error, Debug, Encode, Decode)]
|
#[derive(thiserror::Error, Clone, Debug, Encode, Decode)]
|
||||||
pub enum JobError {
|
pub enum JobError {
|
||||||
#[error("The job timed out")]
|
#[error("The job timed out")]
|
||||||
TimedOut,
|
TimedOut,
|
||||||
@@ -114,4 +115,7 @@ pub enum JobError {
|
|||||||
CouldNotSpawnThread(String),
|
CouldNotSpawnThread(String),
|
||||||
#[error("An error occurred in the CPU time monitor thread: {0}")]
|
#[error("An error occurred in the CPU time monitor thread: {0}")]
|
||||||
CpuTimeMonitorThread(String),
|
CpuTimeMonitorThread(String),
|
||||||
|
/// Since the job can return any exit status it wants, we have to treat this as untrusted.
|
||||||
|
#[error("Unexpected exit status: {0}")]
|
||||||
|
UnexpectedExitStatus(i32),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
//! Contains functionality related to PVFs that is shared by the PVF host and the PVF workers.
|
//! Contains functionality related to PVFs that is shared by the PVF host and the PVF workers.
|
||||||
|
#![deny(unused_crate_dependencies)]
|
||||||
|
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod execute;
|
pub mod execute;
|
||||||
|
|||||||
@@ -18,12 +18,7 @@ use crate::prepare::PrepareJobKind;
|
|||||||
use parity_scale_codec::{Decode, Encode};
|
use parity_scale_codec::{Decode, Encode};
|
||||||
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
|
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
|
||||||
use polkadot_primitives::ExecutorParams;
|
use polkadot_primitives::ExecutorParams;
|
||||||
use std::{
|
use std::{fmt, sync::Arc, time::Duration};
|
||||||
cmp::{Eq, PartialEq},
|
|
||||||
fmt,
|
|
||||||
sync::Arc,
|
|
||||||
time::Duration,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// A struct that carries the exhaustive set of data to prepare an artifact out of plain
|
/// A struct that carries the exhaustive set of data to prepare an artifact out of plain
|
||||||
/// Wasm binary
|
/// Wasm binary
|
||||||
|
|||||||
@@ -18,10 +18,13 @@
|
|||||||
|
|
||||||
pub mod security;
|
pub mod security;
|
||||||
|
|
||||||
use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
|
use crate::{
|
||||||
|
framed_recv_blocking, framed_send_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET,
|
||||||
|
};
|
||||||
use cpu_time::ProcessTime;
|
use cpu_time::ProcessTime;
|
||||||
use futures::never::Never;
|
use futures::never::Never;
|
||||||
use parity_scale_codec::Decode;
|
use nix::{errno::Errno, sys::resource::Usage};
|
||||||
|
use parity_scale_codec::{Decode, Encode};
|
||||||
use std::{
|
use std::{
|
||||||
any::Any,
|
any::Any,
|
||||||
fmt::{self},
|
fmt::{self},
|
||||||
@@ -58,8 +61,6 @@ macro_rules! decl_worker_main {
|
|||||||
|
|
||||||
$crate::sp_tracing::try_init_simple();
|
$crate::sp_tracing::try_init_simple();
|
||||||
|
|
||||||
let worker_pid = std::process::id();
|
|
||||||
|
|
||||||
let args = std::env::args().collect::<Vec<_>>();
|
let args = std::env::args().collect::<Vec<_>>();
|
||||||
if args.len() == 1 {
|
if args.len() == 1 {
|
||||||
print_help($expected_command);
|
print_help($expected_command);
|
||||||
@@ -548,6 +549,81 @@ fn recv_worker_handshake(stream: &mut UnixStream) -> io::Result<WorkerHandshake>
|
|||||||
Ok(worker_handshake)
|
Ok(worker_handshake)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Calculate the total CPU time from the given `usage` structure, returned from
|
||||||
|
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
||||||
|
/// and system time.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// - `rusage`: Contains resource usage information.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// Returns a `Duration` representing the total CPU time.
|
||||||
|
pub fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
||||||
|
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
||||||
|
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
||||||
|
|
||||||
|
return Duration::from_micros(micros)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a job response.
|
||||||
|
pub fn recv_child_response<T>(
|
||||||
|
received_data: &mut io::BufReader<&[u8]>,
|
||||||
|
context: &'static str,
|
||||||
|
) -> io::Result<T>
|
||||||
|
where
|
||||||
|
T: Decode,
|
||||||
|
{
|
||||||
|
let response_bytes = framed_recv_blocking(received_data)?;
|
||||||
|
T::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
||||||
|
io::Error::new(
|
||||||
|
io::ErrorKind::Other,
|
||||||
|
format!("{} pvf recv_child_response: decode error: {}", context, e),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn send_result<T, E>(
|
||||||
|
stream: &mut UnixStream,
|
||||||
|
result: Result<T, E>,
|
||||||
|
worker_info: &WorkerInfo,
|
||||||
|
) -> io::Result<()>
|
||||||
|
where
|
||||||
|
T: std::fmt::Debug,
|
||||||
|
E: std::fmt::Debug + std::fmt::Display,
|
||||||
|
Result<T, E>: Encode,
|
||||||
|
{
|
||||||
|
if let Err(ref err) = result {
|
||||||
|
gum::warn!(
|
||||||
|
target: LOG_TARGET,
|
||||||
|
?worker_info,
|
||||||
|
"worker: error occurred: {}",
|
||||||
|
err
|
||||||
|
);
|
||||||
|
}
|
||||||
|
gum::trace!(
|
||||||
|
target: LOG_TARGET,
|
||||||
|
?worker_info,
|
||||||
|
"worker: sending result to host: {:?}",
|
||||||
|
result
|
||||||
|
);
|
||||||
|
|
||||||
|
framed_send_blocking(stream, &result.encode()).map_err(|err| {
|
||||||
|
gum::warn!(
|
||||||
|
target: LOG_TARGET,
|
||||||
|
?worker_info,
|
||||||
|
"worker: error occurred sending result to host: {}",
|
||||||
|
err
|
||||||
|
);
|
||||||
|
err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stringify_errno(context: &'static str, errno: Errno) -> String {
|
||||||
|
format!("{}: {}: {}", context, errno, io::Error::last_os_error())
|
||||||
|
}
|
||||||
|
|
||||||
/// Functionality related to threads spawned by the workers.
|
/// Functionality related to threads spawned by the workers.
|
||||||
///
|
///
|
||||||
/// The motivation for this module is to coordinate worker threads without using async Rust.
|
/// The motivation for this module is to coordinate worker threads without using async Rust.
|
||||||
|
|||||||
@@ -16,6 +16,9 @@
|
|||||||
|
|
||||||
//! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary.
|
//! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary.
|
||||||
|
|
||||||
|
#![deny(unused_crate_dependencies)]
|
||||||
|
#![warn(missing_docs)]
|
||||||
|
|
||||||
pub use polkadot_node_core_pvf_common::{
|
pub use polkadot_node_core_pvf_common::{
|
||||||
error::ExecuteError, executor_interface::execute_artifact,
|
error::ExecuteError, executor_interface::execute_artifact,
|
||||||
};
|
};
|
||||||
@@ -36,11 +39,12 @@ use nix::{
|
|||||||
use parity_scale_codec::{Decode, Encode};
|
use parity_scale_codec::{Decode, Encode};
|
||||||
use polkadot_node_core_pvf_common::{
|
use polkadot_node_core_pvf_common::{
|
||||||
error::InternalValidationError,
|
error::InternalValidationError,
|
||||||
execute::{Handshake, JobError, JobResponse, JobResult, WorkerResponse},
|
execute::{Handshake, JobError, JobResponse, JobResult, WorkerError, WorkerResponse},
|
||||||
executor_interface::params_to_wasmtime_semantics,
|
executor_interface::params_to_wasmtime_semantics,
|
||||||
framed_recv_blocking, framed_send_blocking,
|
framed_recv_blocking, framed_send_blocking,
|
||||||
worker::{
|
worker::{
|
||||||
cpu_time_monitor_loop, pipe2_cloexec, run_worker, stringify_panic_payload,
|
cpu_time_monitor_loop, get_total_cpu_usage, pipe2_cloexec, recv_child_response, run_worker,
|
||||||
|
send_result, stringify_errno, stringify_panic_payload,
|
||||||
thread::{self, WaitOutcome},
|
thread::{self, WaitOutcome},
|
||||||
PipeFd, WorkerInfo, WorkerKind,
|
PipeFd, WorkerInfo, WorkerKind,
|
||||||
},
|
},
|
||||||
@@ -93,8 +97,14 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec<u8>, Duration)> {
|
|||||||
Ok((params, execution_timeout))
|
Ok((params, execution_timeout))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn send_response(stream: &mut UnixStream, response: WorkerResponse) -> io::Result<()> {
|
/// Sends an error to the host and returns the original error wrapped in `io::Error`.
|
||||||
framed_send_blocking(stream, &response.encode())
|
macro_rules! map_and_send_err {
|
||||||
|
($error:expr, $err_constructor:expr, $stream:expr, $worker_info:expr) => {{
|
||||||
|
let err: WorkerError = $err_constructor($error.to_string()).into();
|
||||||
|
let io_err = io::Error::new(io::ErrorKind::Other, err.to_string());
|
||||||
|
let _ = send_result::<WorkerResponse, WorkerError>($stream, Err(err), $worker_info);
|
||||||
|
io_err
|
||||||
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The entrypoint that the spawned execute worker should start with.
|
/// The entrypoint that the spawned execute worker should start with.
|
||||||
@@ -110,8 +120,6 @@ fn send_response(stream: &mut UnixStream, response: WorkerResponse) -> io::Resul
|
|||||||
/// check is not necessary.
|
/// check is not necessary.
|
||||||
///
|
///
|
||||||
/// - `worker_version`: see above
|
/// - `worker_version`: see above
|
||||||
///
|
|
||||||
/// - `security_status`: contains the detected status of security features.
|
|
||||||
pub fn worker_entrypoint(
|
pub fn worker_entrypoint(
|
||||||
socket_path: PathBuf,
|
socket_path: PathBuf,
|
||||||
worker_dir_path: PathBuf,
|
worker_dir_path: PathBuf,
|
||||||
@@ -127,13 +135,28 @@ pub fn worker_entrypoint(
|
|||||||
|mut stream, worker_info, security_status| {
|
|mut stream, worker_info, security_status| {
|
||||||
let artifact_path = worker_dir::execute_artifact(&worker_info.worker_dir_path);
|
let artifact_path = worker_dir::execute_artifact(&worker_info.worker_dir_path);
|
||||||
|
|
||||||
let Handshake { executor_params } = recv_execute_handshake(&mut stream)?;
|
let Handshake { executor_params } =
|
||||||
|
recv_execute_handshake(&mut stream).map_err(|e| {
|
||||||
|
map_and_send_err!(
|
||||||
|
e,
|
||||||
|
InternalValidationError::HostCommunication,
|
||||||
|
&mut stream,
|
||||||
|
worker_info
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
let executor_params: Arc<ExecutorParams> = Arc::new(executor_params);
|
let executor_params: Arc<ExecutorParams> = Arc::new(executor_params);
|
||||||
let execute_thread_stack_size = max_stack_size(&executor_params);
|
let execute_thread_stack_size = max_stack_size(&executor_params);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let (params, execution_timeout) = recv_request(&mut stream)?;
|
let (params, execution_timeout) = recv_request(&mut stream).map_err(|e| {
|
||||||
|
map_and_send_err!(
|
||||||
|
e,
|
||||||
|
InternalValidationError::HostCommunication,
|
||||||
|
&mut stream,
|
||||||
|
worker_info
|
||||||
|
)
|
||||||
|
})?;
|
||||||
gum::debug!(
|
gum::debug!(
|
||||||
target: LOG_TARGET,
|
target: LOG_TARGET,
|
||||||
?worker_info,
|
?worker_info,
|
||||||
@@ -143,27 +166,34 @@ pub fn worker_entrypoint(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Get the artifact bytes.
|
// Get the artifact bytes.
|
||||||
let compiled_artifact_blob = match std::fs::read(&artifact_path) {
|
let compiled_artifact_blob = std::fs::read(&artifact_path).map_err(|e| {
|
||||||
Ok(bytes) => bytes,
|
map_and_send_err!(
|
||||||
Err(err) => {
|
e,
|
||||||
let response = WorkerResponse::InternalError(
|
InternalValidationError::CouldNotOpenFile,
|
||||||
InternalValidationError::CouldNotOpenFile(err.to_string()),
|
&mut stream,
|
||||||
);
|
worker_info
|
||||||
send_response(&mut stream, response)?;
|
)
|
||||||
continue
|
})?;
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let (pipe_read_fd, pipe_write_fd) = pipe2_cloexec()?;
|
let (pipe_read_fd, pipe_write_fd) = pipe2_cloexec().map_err(|e| {
|
||||||
|
map_and_send_err!(
|
||||||
|
e,
|
||||||
|
InternalValidationError::CouldNotCreatePipe,
|
||||||
|
&mut stream,
|
||||||
|
worker_info
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)
|
||||||
Ok(usage) => usage,
|
.map_err(|errno| {
|
||||||
Err(errno) => {
|
let e = stringify_errno("getrusage before", errno);
|
||||||
let response = internal_error_from_errno("getrusage before", errno);
|
map_and_send_err!(
|
||||||
send_response(&mut stream, response)?;
|
e,
|
||||||
continue
|
InternalValidationError::Kernel,
|
||||||
},
|
&mut stream,
|
||||||
};
|
worker_info
|
||||||
|
)
|
||||||
|
})?;
|
||||||
let stream_fd = stream.as_raw_fd();
|
let stream_fd = stream.as_raw_fd();
|
||||||
|
|
||||||
let compiled_artifact_blob = Arc::new(compiled_artifact_blob);
|
let compiled_artifact_blob = Arc::new(compiled_artifact_blob);
|
||||||
@@ -222,7 +252,7 @@ pub fn worker_entrypoint(
|
|||||||
"worker: sending result to host: {:?}",
|
"worker: sending result to host: {:?}",
|
||||||
result
|
result
|
||||||
);
|
);
|
||||||
send_response(&mut stream, result)?;
|
send_result(&mut stream, result, worker_info)?;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
@@ -270,7 +300,7 @@ fn handle_clone(
|
|||||||
worker_info: &WorkerInfo,
|
worker_info: &WorkerInfo,
|
||||||
have_unshare_newuser: bool,
|
have_unshare_newuser: bool,
|
||||||
usage_before: Usage,
|
usage_before: Usage,
|
||||||
) -> io::Result<WorkerResponse> {
|
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||||
use polkadot_node_core_pvf_common::worker::security;
|
use polkadot_node_core_pvf_common::worker::security;
|
||||||
|
|
||||||
// SAFETY: new process is spawned within a single threaded process. This invariant
|
// SAFETY: new process is spawned within a single threaded process. This invariant
|
||||||
@@ -301,7 +331,8 @@ fn handle_clone(
|
|||||||
usage_before,
|
usage_before,
|
||||||
execution_timeout,
|
execution_timeout,
|
||||||
),
|
),
|
||||||
Err(security::clone::Error::Clone(errno)) => Ok(internal_error_from_errno("clone", errno)),
|
Err(security::clone::Error::Clone(errno)) =>
|
||||||
|
Ok(Err(internal_error_from_errno("clone", errno))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -316,7 +347,7 @@ fn handle_fork(
|
|||||||
execute_worker_stack_size: usize,
|
execute_worker_stack_size: usize,
|
||||||
worker_info: &WorkerInfo,
|
worker_info: &WorkerInfo,
|
||||||
usage_before: Usage,
|
usage_before: Usage,
|
||||||
) -> io::Result<WorkerResponse> {
|
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||||
// SAFETY: new process is spawned within a single threaded process. This invariant
|
// SAFETY: new process is spawned within a single threaded process. This invariant
|
||||||
// is enforced by tests.
|
// is enforced by tests.
|
||||||
match unsafe { nix::unistd::fork() } {
|
match unsafe { nix::unistd::fork() } {
|
||||||
@@ -338,7 +369,7 @@ fn handle_fork(
|
|||||||
usage_before,
|
usage_before,
|
||||||
execution_timeout,
|
execution_timeout,
|
||||||
),
|
),
|
||||||
Err(errno) => Ok(internal_error_from_errno("fork", errno)),
|
Err(errno) => Ok(Err(internal_error_from_errno("fork", errno))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -483,11 +514,11 @@ fn handle_parent_process(
|
|||||||
job_pid: Pid,
|
job_pid: Pid,
|
||||||
usage_before: Usage,
|
usage_before: Usage,
|
||||||
timeout: Duration,
|
timeout: Duration,
|
||||||
) -> io::Result<WorkerResponse> {
|
) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||||
// the read end will wait until all write ends have been closed,
|
// the read end will wait until all write ends have been closed,
|
||||||
// this drop is necessary to avoid deadlock
|
// this drop is necessary to avoid deadlock
|
||||||
if let Err(errno) = nix::unistd::close(pipe_write_fd) {
|
if let Err(errno) = nix::unistd::close(pipe_write_fd) {
|
||||||
return Ok(internal_error_from_errno("closing pipe write fd", errno));
|
return Ok(Err(internal_error_from_errno("closing pipe write fd", errno)));
|
||||||
};
|
};
|
||||||
|
|
||||||
// SAFETY: pipe_read_fd is an open and owned file descriptor at this point.
|
// SAFETY: pipe_read_fd is an open and owned file descriptor at this point.
|
||||||
@@ -512,7 +543,7 @@ fn handle_parent_process(
|
|||||||
|
|
||||||
let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
||||||
Ok(usage) => usage,
|
Ok(usage) => usage,
|
||||||
Err(errno) => return Ok(internal_error_from_errno("getrusage after", errno)),
|
Err(errno) => return Ok(Err(internal_error_from_errno("getrusage after", errno))),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
|
// Using `getrusage` is needed to check whether child has timedout since we cannot rely on
|
||||||
@@ -530,32 +561,25 @@ fn handle_parent_process(
|
|||||||
cpu_tv.as_millis(),
|
cpu_tv.as_millis(),
|
||||||
timeout.as_millis(),
|
timeout.as_millis(),
|
||||||
);
|
);
|
||||||
return Ok(WorkerResponse::JobTimedOut)
|
return Ok(Err(WorkerError::JobTimedOut))
|
||||||
}
|
}
|
||||||
|
|
||||||
match status {
|
match status {
|
||||||
Ok(WaitStatus::Exited(_, exit_status)) => {
|
Ok(WaitStatus::Exited(_, exit_status)) => {
|
||||||
let mut reader = io::BufReader::new(received_data.as_slice());
|
let mut reader = io::BufReader::new(received_data.as_slice());
|
||||||
let result = match recv_child_response(&mut reader) {
|
let result = recv_child_response(&mut reader, "execute")?;
|
||||||
Ok(result) => result,
|
|
||||||
Err(err) => return Ok(WorkerResponse::JobError(err.to_string())),
|
|
||||||
};
|
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(JobResponse::Ok { result_descriptor }) => {
|
Ok(job_response) => {
|
||||||
// The exit status should have been zero if no error occurred.
|
// The exit status should have been zero if no error occurred.
|
||||||
if exit_status != 0 {
|
if exit_status != 0 {
|
||||||
return Ok(WorkerResponse::JobError(format!(
|
return Ok(Err(WorkerError::JobError(JobError::UnexpectedExitStatus(
|
||||||
"unexpected exit status: {}",
|
exit_status,
|
||||||
exit_status
|
))));
|
||||||
)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv })
|
Ok(Ok(WorkerResponse { job_response, duration: cpu_tv }))
|
||||||
},
|
},
|
||||||
Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)),
|
|
||||||
Ok(JobResponse::RuntimeConstruction(err)) =>
|
|
||||||
Ok(WorkerResponse::RuntimeConstruction(err)),
|
|
||||||
Err(job_error) => {
|
Err(job_error) => {
|
||||||
gum::warn!(
|
gum::warn!(
|
||||||
target: LOG_TARGET,
|
target: LOG_TARGET,
|
||||||
@@ -565,9 +589,9 @@ fn handle_parent_process(
|
|||||||
job_error,
|
job_error,
|
||||||
);
|
);
|
||||||
if matches!(job_error, JobError::TimedOut) {
|
if matches!(job_error, JobError::TimedOut) {
|
||||||
Ok(WorkerResponse::JobTimedOut)
|
Ok(Err(WorkerError::JobTimedOut))
|
||||||
} else {
|
} else {
|
||||||
Ok(WorkerResponse::JobError(job_error.to_string()))
|
Ok(Err(WorkerError::JobError(job_error.into())))
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -576,50 +600,21 @@ fn handle_parent_process(
|
|||||||
//
|
//
|
||||||
// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
|
// The job gets SIGSYS on seccomp violations, but this signal may have been sent for some
|
||||||
// other reason, so we still need to check for seccomp violations elsewhere.
|
// other reason, so we still need to check for seccomp violations elsewhere.
|
||||||
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Ok(WorkerResponse::JobDied {
|
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => Ok(Err(WorkerError::JobDied {
|
||||||
err: format!("received signal: {signal:?}"),
|
err: format!("received signal: {signal:?}"),
|
||||||
job_pid: job_pid.as_raw(),
|
job_pid: job_pid.as_raw(),
|
||||||
}),
|
})),
|
||||||
Err(errno) => Ok(internal_error_from_errno("waitpid", errno)),
|
Err(errno) => Ok(Err(internal_error_from_errno("waitpid", errno))),
|
||||||
|
|
||||||
// It is within an attacker's power to send an unexpected exit status. So we cannot treat
|
// It is within an attacker's power to send an unexpected exit status. So we cannot treat
|
||||||
// this as an internal error (which would make us abstain), but must vote against.
|
// this as an internal error (which would make us abstain), but must vote against.
|
||||||
Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied {
|
Ok(unexpected_wait_status) => Ok(Err(WorkerError::JobDied {
|
||||||
err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
|
err: format!("unexpected status from wait: {unexpected_wait_status:?}"),
|
||||||
job_pid: job_pid.as_raw(),
|
job_pid: job_pid.as_raw(),
|
||||||
}),
|
})),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
|
||||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
|
||||||
/// and system time.
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
///
|
|
||||||
/// - `rusage`: Contains resource usage information.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
///
|
|
||||||
/// Returns a `Duration` representing the total CPU time.
|
|
||||||
fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
|
||||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
|
||||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
|
||||||
|
|
||||||
return Duration::from_micros(micros)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get a job response.
|
|
||||||
fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<JobResult> {
|
|
||||||
let response_bytes = framed_recv_blocking(received_data)?;
|
|
||||||
JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
|
||||||
io::Error::new(
|
|
||||||
io::ErrorKind::Other,
|
|
||||||
format!("execute pvf recv_child_response: decode error: {}", e),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write a job response to the pipe and exit process after.
|
/// Write a job response to the pipe and exit process after.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
@@ -638,15 +633,10 @@ fn send_child_response(pipe_write: &mut PipeFd, response: JobResult) -> ! {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerResponse {
|
fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerError {
|
||||||
WorkerResponse::InternalError(InternalValidationError::Kernel(format!(
|
WorkerError::InternalError(InternalValidationError::Kernel(stringify_errno(context, errno)))
|
||||||
"{}: {}: {}",
|
|
||||||
context,
|
|
||||||
errno,
|
|
||||||
io::Error::last_os_error()
|
|
||||||
)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn job_error_from_errno(context: &'static str, errno: Errno) -> JobResult {
|
fn job_error_from_errno(context: &'static str, errno: Errno) -> JobResult {
|
||||||
Err(JobError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error())))
|
Err(JobError::Kernel(stringify_errno(context, errno)))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker";
|
|||||||
use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread};
|
use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread};
|
||||||
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
|
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
|
||||||
use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
|
use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
|
||||||
use libc;
|
|
||||||
use nix::{
|
use nix::{
|
||||||
errno::Errno,
|
errno::Errno,
|
||||||
sys::{
|
sys::{
|
||||||
@@ -48,7 +47,8 @@ use polkadot_node_core_pvf_common::{
|
|||||||
prepare::{MemoryStats, PrepareJobKind, PrepareStats, PrepareWorkerSuccess},
|
prepare::{MemoryStats, PrepareJobKind, PrepareStats, PrepareWorkerSuccess},
|
||||||
pvf::PvfPrepData,
|
pvf::PvfPrepData,
|
||||||
worker::{
|
worker::{
|
||||||
cpu_time_monitor_loop, run_worker, stringify_panic_payload,
|
cpu_time_monitor_loop, get_total_cpu_usage, recv_child_response, run_worker, send_result,
|
||||||
|
stringify_errno, stringify_panic_payload,
|
||||||
thread::{self, spawn_worker_thread, WaitOutcome},
|
thread::{self, spawn_worker_thread, WaitOutcome},
|
||||||
WorkerKind,
|
WorkerKind,
|
||||||
},
|
},
|
||||||
@@ -117,11 +117,6 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<PvfPrepData> {
|
|||||||
Ok(pvf)
|
Ok(pvf)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Send a worker response.
|
|
||||||
fn send_response(stream: &mut UnixStream, result: PrepareWorkerResult) -> io::Result<()> {
|
|
||||||
framed_send_blocking(stream, &result.encode())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn start_memory_tracking(fd: RawFd, limit: Option<isize>) {
|
fn start_memory_tracking(fd: RawFd, limit: Option<isize>) {
|
||||||
unsafe {
|
unsafe {
|
||||||
// SAFETY: Inside the failure handler, the allocator is locked and no allocations or
|
// SAFETY: Inside the failure handler, the allocator is locked and no allocations or
|
||||||
@@ -178,8 +173,6 @@ fn end_memory_tracking() -> isize {
|
|||||||
///
|
///
|
||||||
/// - `worker_version`: see above
|
/// - `worker_version`: see above
|
||||||
///
|
///
|
||||||
/// - `security_status`: contains the detected status of security features.
|
|
||||||
///
|
|
||||||
/// # Flow
|
/// # Flow
|
||||||
///
|
///
|
||||||
/// This runs the following in a loop:
|
/// This runs the following in a loop:
|
||||||
@@ -233,8 +226,9 @@ pub fn worker_entrypoint(
|
|||||||
let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) {
|
||||||
Ok(usage) => usage,
|
Ok(usage) => usage,
|
||||||
Err(errno) => {
|
Err(errno) => {
|
||||||
let result = Err(error_from_errno("getrusage before", errno));
|
let result: PrepareWorkerResult =
|
||||||
send_response(&mut stream, result)?;
|
Err(error_from_errno("getrusage before", errno));
|
||||||
|
send_result(&mut stream, result, worker_info)?;
|
||||||
continue
|
continue
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -294,7 +288,7 @@ pub fn worker_entrypoint(
|
|||||||
"worker: sending result to host: {:?}",
|
"worker: sending result to host: {:?}",
|
||||||
result
|
result
|
||||||
);
|
);
|
||||||
send_response(&mut stream, result)?;
|
send_result(&mut stream, result, worker_info)?;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
@@ -666,7 +660,7 @@ fn handle_parent_process(
|
|||||||
match status {
|
match status {
|
||||||
Ok(WaitStatus::Exited(_pid, exit_status)) => {
|
Ok(WaitStatus::Exited(_pid, exit_status)) => {
|
||||||
let mut reader = io::BufReader::new(received_data.as_slice());
|
let mut reader = io::BufReader::new(received_data.as_slice());
|
||||||
let result = recv_child_response(&mut reader)
|
let result = recv_child_response(&mut reader, "prepare")
|
||||||
.map_err(|err| PrepareError::JobError(err.to_string()))?;
|
.map_err(|err| PrepareError::JobError(err.to_string()))?;
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
@@ -726,35 +720,6 @@ fn handle_parent_process(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate the total CPU time from the given `usage` structure, returned from
|
|
||||||
/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user
|
|
||||||
/// and system time.
|
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
///
|
|
||||||
/// - `rusage`: Contains resource usage information.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
///
|
|
||||||
/// Returns a `Duration` representing the total CPU time.
|
|
||||||
fn get_total_cpu_usage(rusage: Usage) -> Duration {
|
|
||||||
let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) +
|
|
||||||
(rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64;
|
|
||||||
|
|
||||||
return Duration::from_micros(micros)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get a job response.
|
|
||||||
fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<JobResult> {
|
|
||||||
let response_bytes = framed_recv_blocking(received_data)?;
|
|
||||||
JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
|
||||||
io::Error::new(
|
|
||||||
io::ErrorKind::Other,
|
|
||||||
format!("prepare pvf recv_child_response: decode error: {:?}", e),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write a job response to the pipe and exit process after.
|
/// Write a job response to the pipe and exit process after.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
@@ -774,7 +739,7 @@ fn send_child_response(pipe_write: &mut PipeFd, response: JobResult) -> ! {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError {
|
fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError {
|
||||||
PrepareError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error()))
|
PrepareError::Kernel(stringify_errno(context, errno))
|
||||||
}
|
}
|
||||||
|
|
||||||
type JobResult = Result<JobResponse, PrepareError>;
|
type JobResult = Result<JobResponse, PrepareError>;
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
//! A queue that handles requests for PVF execution.
|
//! A queue that handles requests for PVF execution.
|
||||||
|
|
||||||
use super::worker_interface::Outcome;
|
use super::worker_interface::{Error as WorkerInterfaceError, Response as WorkerInterfaceResponse};
|
||||||
use crate::{
|
use crate::{
|
||||||
artifacts::{ArtifactId, ArtifactPathId},
|
artifacts::{ArtifactId, ArtifactPathId},
|
||||||
host::ResultSender,
|
host::ResultSender,
|
||||||
@@ -30,7 +30,10 @@ use futures::{
|
|||||||
stream::{FuturesUnordered, StreamExt as _},
|
stream::{FuturesUnordered, StreamExt as _},
|
||||||
Future, FutureExt,
|
Future, FutureExt,
|
||||||
};
|
};
|
||||||
use polkadot_node_core_pvf_common::SecurityStatus;
|
use polkadot_node_core_pvf_common::{
|
||||||
|
execute::{JobResponse, WorkerError, WorkerResponse},
|
||||||
|
SecurityStatus,
|
||||||
|
};
|
||||||
use polkadot_primitives::{ExecutorParams, ExecutorParamsHash};
|
use polkadot_primitives::{ExecutorParams, ExecutorParamsHash};
|
||||||
use slotmap::HopSlotMap;
|
use slotmap::HopSlotMap;
|
||||||
use std::{
|
use std::{
|
||||||
@@ -133,7 +136,12 @@ impl Workers {
|
|||||||
|
|
||||||
enum QueueEvent {
|
enum QueueEvent {
|
||||||
Spawn(IdleWorker, WorkerHandle, ExecuteJob),
|
Spawn(IdleWorker, WorkerHandle, ExecuteJob),
|
||||||
StartWork(Worker, Outcome, ArtifactId, ResultSender),
|
StartWork(
|
||||||
|
Worker,
|
||||||
|
Result<WorkerInterfaceResponse, WorkerInterfaceError>,
|
||||||
|
ArtifactId,
|
||||||
|
ResultSender,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
type Mux = FuturesUnordered<BoxFuture<'static, QueueEvent>>;
|
type Mux = FuturesUnordered<BoxFuture<'static, QueueEvent>>;
|
||||||
@@ -340,23 +348,34 @@ fn handle_worker_spawned(
|
|||||||
async fn handle_job_finish(
|
async fn handle_job_finish(
|
||||||
queue: &mut Queue,
|
queue: &mut Queue,
|
||||||
worker: Worker,
|
worker: Worker,
|
||||||
outcome: Outcome,
|
worker_result: Result<WorkerInterfaceResponse, WorkerInterfaceError>,
|
||||||
artifact_id: ArtifactId,
|
artifact_id: ArtifactId,
|
||||||
result_tx: ResultSender,
|
result_tx: ResultSender,
|
||||||
) {
|
) {
|
||||||
let (idle_worker, result, duration, sync_channel) = match outcome {
|
let (idle_worker, result, duration, sync_channel) = match worker_result {
|
||||||
Outcome::Ok { result_descriptor, duration, idle_worker } => {
|
Ok(WorkerInterfaceResponse {
|
||||||
|
worker_response:
|
||||||
|
WorkerResponse { job_response: JobResponse::Ok { result_descriptor }, duration },
|
||||||
|
idle_worker,
|
||||||
|
}) => {
|
||||||
// TODO: propagate the soft timeout
|
// TODO: propagate the soft timeout
|
||||||
|
|
||||||
(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
|
(Some(idle_worker), Ok(result_descriptor), Some(duration), None)
|
||||||
},
|
},
|
||||||
Outcome::InvalidCandidate { err, idle_worker } => (
|
Ok(WorkerInterfaceResponse {
|
||||||
|
worker_response: WorkerResponse { job_response: JobResponse::InvalidCandidate(err), .. },
|
||||||
|
idle_worker,
|
||||||
|
}) => (
|
||||||
Some(idle_worker),
|
Some(idle_worker),
|
||||||
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
|
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
Outcome::RuntimeConstruction { err, idle_worker } => {
|
Ok(WorkerInterfaceResponse {
|
||||||
|
worker_response:
|
||||||
|
WorkerResponse { job_response: JobResponse::RuntimeConstruction(err), .. },
|
||||||
|
idle_worker,
|
||||||
|
}) => {
|
||||||
// The task for artifact removal is executed concurrently with
|
// The task for artifact removal is executed concurrently with
|
||||||
// the message to the host on the execution result.
|
// the message to the host on the execution result.
|
||||||
let (result_tx, result_rx) = oneshot::channel();
|
let (result_tx, result_rx) = oneshot::channel();
|
||||||
@@ -376,27 +395,31 @@ async fn handle_job_finish(
|
|||||||
Some(result_rx),
|
Some(result_rx),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None, None),
|
|
||||||
|
Err(WorkerInterfaceError::InternalError(err)) |
|
||||||
|
Err(WorkerInterfaceError::WorkerError(WorkerError::InternalError(err))) =>
|
||||||
|
(None, Err(ValidationError::Internal(err)), None, None),
|
||||||
// Either the worker or the job timed out. Kill the worker in either case. Treated as
|
// Either the worker or the job timed out. Kill the worker in either case. Treated as
|
||||||
// definitely-invalid, because if we timed out, there's no time left for a retry.
|
// definitely-invalid, because if we timed out, there's no time left for a retry.
|
||||||
Outcome::HardTimeout =>
|
Err(WorkerInterfaceError::HardTimeout) |
|
||||||
|
Err(WorkerInterfaceError::WorkerError(WorkerError::JobTimedOut)) =>
|
||||||
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
|
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None, None),
|
||||||
// "Maybe invalid" errors (will retry).
|
// "Maybe invalid" errors (will retry).
|
||||||
Outcome::WorkerIntfErr => (
|
Err(WorkerInterfaceError::CommunicationErr(_err)) => (
|
||||||
None,
|
None,
|
||||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
|
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
Outcome::JobDied { err } => (
|
Err(WorkerInterfaceError::WorkerError(WorkerError::JobDied { err, .. })) => (
|
||||||
None,
|
None,
|
||||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
|
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
Outcome::JobError { err } => (
|
Err(WorkerInterfaceError::WorkerError(WorkerError::JobError(err))) => (
|
||||||
None,
|
None,
|
||||||
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))),
|
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err.to_string()))),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
@@ -543,14 +566,14 @@ fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) {
|
|||||||
queue.mux.push(
|
queue.mux.push(
|
||||||
async move {
|
async move {
|
||||||
let _timer = execution_timer;
|
let _timer = execution_timer;
|
||||||
let outcome = super::worker_interface::start_work(
|
let result = super::worker_interface::start_work(
|
||||||
idle,
|
idle,
|
||||||
job.artifact.clone(),
|
job.artifact.clone(),
|
||||||
job.exec_timeout,
|
job.exec_timeout,
|
||||||
job.params,
|
job.params,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
QueueEvent::StartWork(worker, outcome, job.artifact.id, job.result_tx)
|
QueueEvent::StartWork(worker, result, job.artifact.id, job.result_tx)
|
||||||
}
|
}
|
||||||
.boxed(),
|
.boxed(),
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -29,10 +29,9 @@ use futures_timer::Delay;
|
|||||||
use parity_scale_codec::{Decode, Encode};
|
use parity_scale_codec::{Decode, Encode};
|
||||||
use polkadot_node_core_pvf_common::{
|
use polkadot_node_core_pvf_common::{
|
||||||
error::InternalValidationError,
|
error::InternalValidationError,
|
||||||
execute::{Handshake, WorkerResponse},
|
execute::{Handshake, WorkerError, WorkerResponse},
|
||||||
worker_dir, SecurityStatus,
|
worker_dir, SecurityStatus,
|
||||||
};
|
};
|
||||||
use polkadot_parachain_primitives::primitives::ValidationResult;
|
|
||||||
use polkadot_primitives::ExecutorParams;
|
use polkadot_primitives::ExecutorParams;
|
||||||
use std::{path::Path, time::Duration};
|
use std::{path::Path, time::Duration};
|
||||||
use tokio::{io, net::UnixStream};
|
use tokio::{io, net::UnixStream};
|
||||||
@@ -69,7 +68,8 @@ pub async fn spawn(
|
|||||||
gum::warn!(
|
gum::warn!(
|
||||||
target: LOG_TARGET,
|
target: LOG_TARGET,
|
||||||
worker_pid = %idle_worker.pid,
|
worker_pid = %idle_worker.pid,
|
||||||
%err
|
"failed to send a handshake to the spawned worker: {}",
|
||||||
|
error
|
||||||
);
|
);
|
||||||
err
|
err
|
||||||
})?;
|
})?;
|
||||||
@@ -78,39 +78,40 @@ pub async fn spawn(
|
|||||||
|
|
||||||
/// Outcome of PVF execution.
|
/// Outcome of PVF execution.
|
||||||
///
|
///
|
||||||
/// If the idle worker token is not returned, it means the worker must be terminated.
|
/// PVF execution completed and the result is returned. The worker is ready for
|
||||||
pub enum Outcome {
|
/// another job.
|
||||||
/// PVF execution completed successfully and the result is returned. The worker is ready for
|
pub struct Response {
|
||||||
/// another job.
|
/// The response (valid/invalid) from the worker.
|
||||||
Ok { result_descriptor: ValidationResult, duration: Duration, idle_worker: IdleWorker },
|
pub worker_response: WorkerResponse,
|
||||||
/// The candidate validation failed. It may be for example because the wasm execution triggered
|
/// Returning the idle worker token means the worker can be reused.
|
||||||
/// a trap. Errors related to the preparation process are not expected to be encountered by the
|
pub idle_worker: IdleWorker,
|
||||||
/// execution workers.
|
}
|
||||||
InvalidCandidate { err: String, idle_worker: IdleWorker },
|
/// The idle worker token is not returned for any of these cases, meaning the worker must be
|
||||||
/// The error is probably transient. It may be for example
|
/// terminated.
|
||||||
/// because the artifact was prepared with a Wasmtime version different from the version
|
///
|
||||||
/// in the current execution environment.
|
/// NOTE: Errors related to the preparation process are not expected to be encountered by the
|
||||||
RuntimeConstruction { err: String, idle_worker: IdleWorker },
|
/// execution workers.
|
||||||
|
#[derive(thiserror::Error, Debug)]
|
||||||
|
pub enum Error {
|
||||||
/// The execution time exceeded the hard limit. The worker is terminated.
|
/// The execution time exceeded the hard limit. The worker is terminated.
|
||||||
|
#[error("The communication with the worker exceeded the hard limit")]
|
||||||
HardTimeout,
|
HardTimeout,
|
||||||
/// An I/O error happened during communication with the worker. This may mean that the worker
|
/// An I/O error happened during communication with the worker. This may mean that the worker
|
||||||
/// process already died. The token is not returned in any case.
|
/// process already died. The token is not returned in any case.
|
||||||
WorkerIntfErr,
|
#[error("An I/O error happened during communication with the worker: {0}")]
|
||||||
/// The job process has died. We must kill the worker just in case.
|
CommunicationErr(#[from] io::Error),
|
||||||
///
|
/// The worker reported an error (can be from itself or from the job). The worker should not be
|
||||||
/// We cannot treat this as an internal error because malicious code may have caused this.
|
/// reused.
|
||||||
JobDied { err: String },
|
#[error("The worker reported an error: {0}")]
|
||||||
/// An unexpected error occurred in the job process.
|
WorkerError(#[from] WorkerError),
|
||||||
///
|
|
||||||
/// Because malicious code can cause a job error, we must not treat it as an internal error.
|
|
||||||
JobError { err: String },
|
|
||||||
|
|
||||||
/// An internal error happened during the validation. Such an error is most likely related to
|
/// An internal error happened during the validation. Such an error is most likely related to
|
||||||
/// some transient glitch.
|
/// some transient glitch.
|
||||||
///
|
///
|
||||||
/// Should only ever be used for errors independent of the candidate and PVF. Therefore it may
|
/// Should only ever be used for errors independent of the candidate and PVF. Therefore it may
|
||||||
/// be a problem with the worker, so we terminate it.
|
/// be a problem with the worker, so we terminate it.
|
||||||
InternalError { err: InternalValidationError },
|
#[error("An internal error occurred: {0}")]
|
||||||
|
InternalError(#[from] InternalValidationError),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Given the idle token of a worker and parameters of work, communicates with the worker and
|
/// Given the idle token of a worker and parameters of work, communicates with the worker and
|
||||||
@@ -123,7 +124,7 @@ pub async fn start_work(
|
|||||||
artifact: ArtifactPathId,
|
artifact: ArtifactPathId,
|
||||||
execution_timeout: Duration,
|
execution_timeout: Duration,
|
||||||
validation_params: Vec<u8>,
|
validation_params: Vec<u8>,
|
||||||
) -> Outcome {
|
) -> Result<Response, Error> {
|
||||||
let IdleWorker { mut stream, pid, worker_dir } = worker;
|
let IdleWorker { mut stream, pid, worker_dir } = worker;
|
||||||
|
|
||||||
gum::debug!(
|
gum::debug!(
|
||||||
@@ -136,16 +137,18 @@ pub async fn start_work(
|
|||||||
);
|
);
|
||||||
|
|
||||||
with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move {
|
with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move {
|
||||||
if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await {
|
send_request(&mut stream, &validation_params, execution_timeout).await.map_err(
|
||||||
gum::warn!(
|
|error| {
|
||||||
target: LOG_TARGET,
|
gum::warn!(
|
||||||
worker_pid = %pid,
|
target: LOG_TARGET,
|
||||||
validation_code_hash = ?artifact.id.code_hash,
|
worker_pid = %pid,
|
||||||
?error,
|
validation_code_hash = ?artifact.id.code_hash,
|
||||||
"failed to send an execute request",
|
"failed to send an execute request: {}",
|
||||||
);
|
error,
|
||||||
return Outcome::WorkerIntfErr
|
);
|
||||||
}
|
Error::InternalError(InternalValidationError::HostCommunication(error.to_string()))
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
// We use a generous timeout here. This is in addition to the one in the child process, in
|
// We use a generous timeout here. This is in addition to the one in the child process, in
|
||||||
// case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout
|
// case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout
|
||||||
@@ -153,12 +156,12 @@ pub async fn start_work(
|
|||||||
// load, but the CPU resources of the child can only be measured from the parent after the
|
// load, but the CPU resources of the child can only be measured from the parent after the
|
||||||
// child process terminates.
|
// child process terminates.
|
||||||
let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR;
|
let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR;
|
||||||
let response = futures::select! {
|
let worker_result = futures::select! {
|
||||||
response = recv_response(&mut stream).fuse() => {
|
worker_result = recv_result(&mut stream).fuse() => {
|
||||||
match response {
|
match worker_result {
|
||||||
Ok(response) =>
|
Ok(result) =>
|
||||||
handle_response(
|
handle_result(
|
||||||
response,
|
result,
|
||||||
pid,
|
pid,
|
||||||
execution_timeout,
|
execution_timeout,
|
||||||
)
|
)
|
||||||
@@ -168,11 +171,11 @@ pub async fn start_work(
|
|||||||
target: LOG_TARGET,
|
target: LOG_TARGET,
|
||||||
worker_pid = %pid,
|
worker_pid = %pid,
|
||||||
validation_code_hash = ?artifact.id.code_hash,
|
validation_code_hash = ?artifact.id.code_hash,
|
||||||
?error,
|
"failed to recv an execute result: {}",
|
||||||
"failed to recv an execute response",
|
error,
|
||||||
);
|
);
|
||||||
|
|
||||||
return Outcome::WorkerIntfErr
|
return Err(Error::CommunicationErr(error))
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -183,29 +186,16 @@ pub async fn start_work(
|
|||||||
validation_code_hash = ?artifact.id.code_hash,
|
validation_code_hash = ?artifact.id.code_hash,
|
||||||
"execution worker exceeded lenient timeout for execution, child worker likely stalled",
|
"execution worker exceeded lenient timeout for execution, child worker likely stalled",
|
||||||
);
|
);
|
||||||
WorkerResponse::JobTimedOut
|
return Err(Error::HardTimeout)
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
match response {
|
match worker_result {
|
||||||
WorkerResponse::Ok { result_descriptor, duration } => Outcome::Ok {
|
Ok(worker_response) => Ok(Response {
|
||||||
result_descriptor,
|
worker_response,
|
||||||
duration,
|
|
||||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
idle_worker: IdleWorker { stream, pid, worker_dir },
|
||||||
},
|
}),
|
||||||
WorkerResponse::InvalidCandidate(err) => Outcome::InvalidCandidate {
|
Err(worker_error) => Err(worker_error.into()),
|
||||||
err,
|
|
||||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
|
||||||
},
|
|
||||||
WorkerResponse::RuntimeConstruction(err) => Outcome::RuntimeConstruction {
|
|
||||||
err,
|
|
||||||
idle_worker: IdleWorker { stream, pid, worker_dir },
|
|
||||||
},
|
|
||||||
WorkerResponse::JobTimedOut => Outcome::HardTimeout,
|
|
||||||
WorkerResponse::JobDied { err, job_pid: _ } => Outcome::JobDied { err },
|
|
||||||
WorkerResponse::JobError(err) => Outcome::JobError { err },
|
|
||||||
|
|
||||||
WorkerResponse::InternalError(err) => Outcome::InternalError { err },
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
@@ -215,12 +205,12 @@ pub async fn start_work(
|
|||||||
///
|
///
|
||||||
/// Here we know the artifact exists, but is still located in a temporary file which will be cleared
|
/// Here we know the artifact exists, but is still located in a temporary file which will be cleared
|
||||||
/// by [`with_worker_dir_setup`].
|
/// by [`with_worker_dir_setup`].
|
||||||
async fn handle_response(
|
async fn handle_result(
|
||||||
response: WorkerResponse,
|
worker_result: Result<WorkerResponse, WorkerError>,
|
||||||
worker_pid: u32,
|
worker_pid: u32,
|
||||||
execution_timeout: Duration,
|
execution_timeout: Duration,
|
||||||
) -> WorkerResponse {
|
) -> Result<WorkerResponse, WorkerError> {
|
||||||
if let WorkerResponse::Ok { duration, .. } = response {
|
if let Ok(WorkerResponse { duration, .. }) = worker_result {
|
||||||
if duration > execution_timeout {
|
if duration > execution_timeout {
|
||||||
// The job didn't complete within the timeout.
|
// The job didn't complete within the timeout.
|
||||||
gum::warn!(
|
gum::warn!(
|
||||||
@@ -232,11 +222,11 @@ async fn handle_response(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Return a timeout error.
|
// Return a timeout error.
|
||||||
return WorkerResponse::JobTimedOut
|
return Err(WorkerError::JobTimedOut)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
response
|
worker_result
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a temporary file for an artifact in the worker cache, execute the given future/closure
|
/// Create a temporary file for an artifact in the worker cache, execute the given future/closure
|
||||||
@@ -249,9 +239,9 @@ async fn with_worker_dir_setup<F, Fut>(
|
|||||||
pid: u32,
|
pid: u32,
|
||||||
artifact_path: &Path,
|
artifact_path: &Path,
|
||||||
f: F,
|
f: F,
|
||||||
) -> Outcome
|
) -> Result<Response, Error>
|
||||||
where
|
where
|
||||||
Fut: futures::Future<Output = Outcome>,
|
Fut: futures::Future<Output = Result<Response, Error>>,
|
||||||
F: FnOnce(WorkerDir) -> Fut,
|
F: FnOnce(WorkerDir) -> Fut,
|
||||||
{
|
{
|
||||||
// Cheaply create a hard link to the artifact. The artifact is always at a known location in the
|
// Cheaply create a hard link to the artifact. The artifact is always at a known location in the
|
||||||
@@ -263,16 +253,14 @@ where
|
|||||||
target: LOG_TARGET,
|
target: LOG_TARGET,
|
||||||
worker_pid = %pid,
|
worker_pid = %pid,
|
||||||
?worker_dir,
|
?worker_dir,
|
||||||
"failed to clear worker cache after the job: {:?}",
|
"failed to clear worker cache after the job: {}",
|
||||||
err,
|
err,
|
||||||
);
|
);
|
||||||
return Outcome::InternalError {
|
return Err(InternalValidationError::CouldNotCreateLink(format!("{:?}", err)).into());
|
||||||
err: InternalValidationError::CouldNotCreateLink(format!("{:?}", err)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let worker_dir_path = worker_dir.path().to_owned();
|
let worker_dir_path = worker_dir.path().to_owned();
|
||||||
let outcome = f(worker_dir).await;
|
let result = f(worker_dir).await;
|
||||||
|
|
||||||
// Try to clear the worker dir.
|
// Try to clear the worker dir.
|
||||||
if let Err(err) = clear_worker_dir_path(&worker_dir_path) {
|
if let Err(err) = clear_worker_dir_path(&worker_dir_path) {
|
||||||
@@ -283,15 +271,14 @@ where
|
|||||||
"failed to clear worker cache after the job: {:?}",
|
"failed to clear worker cache after the job: {:?}",
|
||||||
err,
|
err,
|
||||||
);
|
);
|
||||||
return Outcome::InternalError {
|
return Err(InternalValidationError::CouldNotClearWorkerDir {
|
||||||
err: InternalValidationError::CouldNotClearWorkerDir {
|
err: format!("{:?}", err),
|
||||||
err: format!("{:?}", err),
|
path: worker_dir_path.to_str().map(String::from),
|
||||||
path: worker_dir_path.to_str().map(String::from),
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
outcome
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sends a handshake with information specific to the execute worker.
|
/// Sends a handshake with information specific to the execute worker.
|
||||||
@@ -308,12 +295,12 @@ async fn send_request(
|
|||||||
framed_send(stream, &execution_timeout.encode()).await
|
framed_send(stream, &execution_timeout.encode()).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn recv_response(stream: &mut UnixStream) -> io::Result<WorkerResponse> {
|
async fn recv_result(stream: &mut UnixStream) -> io::Result<Result<WorkerResponse, WorkerError>> {
|
||||||
let response_bytes = framed_recv(stream).await?;
|
let result_bytes = framed_recv(stream).await?;
|
||||||
WorkerResponse::decode(&mut response_bytes.as_slice()).map_err(|e| {
|
Result::<WorkerResponse, WorkerError>::decode(&mut result_bytes.as_slice()).map_err(|e| {
|
||||||
io::Error::new(
|
io::Error::new(
|
||||||
io::ErrorKind::Other,
|
io::ErrorKind::Other,
|
||||||
format!("execute pvf recv_response: decode error: {:?}", e),
|
format!("execute pvf recv_result: decode error: {:?}", e),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -959,10 +959,7 @@ pub(crate) mod tests {
|
|||||||
use crate::{artifacts::generate_artifact_path, PossiblyInvalidError};
|
use crate::{artifacts::generate_artifact_path, PossiblyInvalidError};
|
||||||
use assert_matches::assert_matches;
|
use assert_matches::assert_matches;
|
||||||
use futures::future::BoxFuture;
|
use futures::future::BoxFuture;
|
||||||
use polkadot_node_core_pvf_common::{
|
use polkadot_node_core_pvf_common::prepare::PrepareStats;
|
||||||
error::PrepareError,
|
|
||||||
prepare::{PrepareStats, PrepareSuccess},
|
|
||||||
};
|
|
||||||
|
|
||||||
const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3);
|
const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3);
|
||||||
pub(crate) const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);
|
pub(crate) const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);
|
||||||
|
|||||||
Reference in New Issue
Block a user