mirror of
https://github.com/pezkuwichain/pezkuwi-subxt.git
synced 2026-06-13 07:01:05 +00:00
PVF: Move PVF workers into separate crate (#7101)
* Move PVF workers into separate crate * Fix indentation * Fix compilation errors * Fix more compilation errors * Rename `worker.rs` files, make host interface to worker more clear * Fix more compilation errors * Fix more compilation errors * Add link to issue * Address review comments * Update comment
This commit is contained in:
@@ -18,10 +18,10 @@
|
||||
//!
|
||||
//! The validation host [runs the queue][`start`] communicating with it by sending [`ToQueue`]
|
||||
//! messages. The queue will spawn workers in new processes. Those processes should jump to
|
||||
//! [`worker_entrypoint`].
|
||||
//! `polkadot_node_core_pvf_worker::execute_worker_entrypoint`.
|
||||
|
||||
mod queue;
|
||||
mod worker;
|
||||
mod worker_intf;
|
||||
|
||||
pub use queue::{start, PendingExecutionRequest, ToQueue};
|
||||
pub use worker::{worker_entrypoint, Response as ExecuteResponse};
|
||||
pub use worker_intf::{Handshake as ExecuteHandshake, Response as ExecuteResponse};
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
//! A queue that handles requests for PVF execution.
|
||||
|
||||
use super::worker::Outcome;
|
||||
use super::worker_intf::Outcome;
|
||||
use crate::{
|
||||
artifacts::{ArtifactId, ArtifactPathId},
|
||||
host::ResultSender,
|
||||
@@ -416,7 +416,8 @@ async fn spawn_worker_task(
|
||||
use futures_timer::Delay;
|
||||
|
||||
loop {
|
||||
match super::worker::spawn(&program_path, job.executor_params.clone(), spawn_timeout).await
|
||||
match super::worker_intf::spawn(&program_path, job.executor_params.clone(), spawn_timeout)
|
||||
.await
|
||||
{
|
||||
Ok((idle, handle)) => break QueueEvent::Spawn(idle, handle, job),
|
||||
Err(err) => {
|
||||
@@ -460,9 +461,13 @@ fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) {
|
||||
queue.mux.push(
|
||||
async move {
|
||||
let _timer = execution_timer;
|
||||
let outcome =
|
||||
super::worker::start_work(idle, job.artifact.clone(), job.exec_timeout, job.params)
|
||||
.await;
|
||||
let outcome = super::worker_intf::start_work(
|
||||
idle,
|
||||
job.artifact.clone(),
|
||||
job.exec_timeout,
|
||||
job.params,
|
||||
)
|
||||
.await;
|
||||
QueueEvent::StartWork(worker, outcome, job.artifact.id, job.result_tx)
|
||||
}
|
||||
.boxed(),
|
||||
|
||||
+26
-157
@@ -14,28 +14,23 @@
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Host interface to the execute worker.
|
||||
|
||||
use crate::{
|
||||
artifacts::ArtifactPathId,
|
||||
executor_intf::Executor,
|
||||
worker_common::{
|
||||
bytes_to_path, cpu_time_monitor_loop, framed_recv, framed_send, path_to_bytes,
|
||||
spawn_with_program_path, worker_event_loop, IdleWorker, SpawnErr, WorkerHandle,
|
||||
JOB_TIMEOUT_WALL_CLOCK_FACTOR,
|
||||
framed_recv, framed_send, path_to_bytes, spawn_with_program_path, IdleWorker, SpawnErr,
|
||||
WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
|
||||
},
|
||||
LOG_TARGET,
|
||||
};
|
||||
use cpu_time::ProcessTime;
|
||||
use futures::{pin_mut, select_biased, FutureExt};
|
||||
use futures::FutureExt;
|
||||
use futures_timer::Delay;
|
||||
use parity_scale_codec::{Decode, Encode};
|
||||
|
||||
use polkadot_parachain::primitives::ValidationResult;
|
||||
use polkadot_primitives::ExecutorParams;
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
sync::{mpsc::channel, Arc},
|
||||
time::Duration,
|
||||
};
|
||||
use std::{path::Path, time::Duration};
|
||||
use tokio::{io, net::UnixStream};
|
||||
|
||||
/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout.
|
||||
@@ -185,17 +180,6 @@ async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Re
|
||||
framed_send(stream, &handshake.encode()).await
|
||||
}
|
||||
|
||||
async fn recv_handshake(stream: &mut UnixStream) -> io::Result<Handshake> {
|
||||
let handshake_enc = framed_recv(stream).await?;
|
||||
let handshake = Handshake::decode(&mut &handshake_enc[..]).map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_handshake: failed to decode Handshake".to_owned(),
|
||||
)
|
||||
})?;
|
||||
Ok(handshake)
|
||||
}
|
||||
|
||||
async fn send_request(
|
||||
stream: &mut UnixStream,
|
||||
artifact_path: &Path,
|
||||
@@ -207,29 +191,6 @@ async fn send_request(
|
||||
framed_send(stream, &execution_timeout.encode()).await
|
||||
}
|
||||
|
||||
async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec<u8>, Duration)> {
|
||||
let artifact_path = framed_recv(stream).await?;
|
||||
let artifact_path = bytes_to_path(&artifact_path).ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_request: non utf-8 artifact path".to_string(),
|
||||
)
|
||||
})?;
|
||||
let params = framed_recv(stream).await?;
|
||||
let execution_timeout = framed_recv(stream).await?;
|
||||
let execution_timeout = Duration::decode(&mut &execution_timeout[..]).map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"execute pvf recv_request: failed to decode duration".to_string(),
|
||||
)
|
||||
})?;
|
||||
Ok((artifact_path, params, execution_timeout))
|
||||
}
|
||||
|
||||
async fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> {
|
||||
framed_send(stream, &response.encode()).await
|
||||
}
|
||||
|
||||
async fn recv_response(stream: &mut UnixStream) -> io::Result<Response> {
|
||||
let response_bytes = framed_recv(stream).await?;
|
||||
Response::decode(&mut &response_bytes[..]).map_err(|e| {
|
||||
@@ -240,28 +201,43 @@ async fn recv_response(stream: &mut UnixStream) -> io::Result<Response> {
|
||||
})
|
||||
}
|
||||
|
||||
/// The payload of the one-time handshake that is done when a worker process is created. Carries
|
||||
/// data from the host to the worker.
|
||||
#[derive(Encode, Decode)]
|
||||
struct Handshake {
|
||||
executor_params: ExecutorParams,
|
||||
pub struct Handshake {
|
||||
/// The executor parameters.
|
||||
pub executor_params: ExecutorParams,
|
||||
}
|
||||
|
||||
/// The response from an execution job on the worker.
|
||||
#[derive(Encode, Decode)]
|
||||
pub enum Response {
|
||||
Ok { result_descriptor: ValidationResult, duration: Duration },
|
||||
/// The job completed successfully.
|
||||
Ok {
|
||||
/// The result of parachain validation.
|
||||
result_descriptor: ValidationResult,
|
||||
/// The amount of CPU time taken by the job.
|
||||
duration: Duration,
|
||||
},
|
||||
/// The candidate is invalid.
|
||||
InvalidCandidate(String),
|
||||
/// The job timed out.
|
||||
TimedOut,
|
||||
/// Some internal error occurred. Should only be used for errors independent of the candidate.
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
impl Response {
|
||||
fn format_invalid(ctx: &'static str, msg: &str) -> Self {
|
||||
/// Creates an invalid response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn format_invalid(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::InvalidCandidate(ctx.to_string())
|
||||
} else {
|
||||
Self::InvalidCandidate(format!("{}: {}", ctx, msg))
|
||||
}
|
||||
}
|
||||
fn format_internal(ctx: &'static str, msg: &str) -> Self {
|
||||
/// Creates an internal response from a context `ctx` and a message `msg` (which can be empty).
|
||||
pub fn format_internal(ctx: &'static str, msg: &str) -> Self {
|
||||
if msg.is_empty() {
|
||||
Self::InternalError(ctx.to_string())
|
||||
} else {
|
||||
@@ -269,110 +245,3 @@ impl Response {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The entrypoint that the spawned execute worker should start with. The `socket_path` specifies
|
||||
/// the path to the socket used to communicate with the host. The `node_version`, if `Some`,
|
||||
/// is checked against the worker version. A mismatch results in immediate worker termination.
|
||||
/// `None` is used for tests and in other situations when version check is not necessary.
|
||||
pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) {
|
||||
worker_event_loop("execute", socket_path, node_version, |rt_handle, mut stream| async move {
|
||||
let worker_pid = std::process::id();
|
||||
|
||||
let handshake = recv_handshake(&mut stream).await?;
|
||||
let executor = Arc::new(Executor::new(handshake.executor_params).map_err(|e| {
|
||||
io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e))
|
||||
})?);
|
||||
|
||||
loop {
|
||||
let (artifact_path, params, execution_timeout) = recv_request(&mut stream).await?;
|
||||
gum::debug!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"worker: validating artifact {}",
|
||||
artifact_path.display(),
|
||||
);
|
||||
|
||||
// Used to signal to the cpu time monitor thread that it can finish.
|
||||
let (finished_tx, finished_rx) = channel::<()>();
|
||||
let cpu_time_start = ProcessTime::now();
|
||||
|
||||
// Spawn a new thread that runs the CPU time monitor.
|
||||
let cpu_time_monitor_fut = rt_handle
|
||||
.spawn_blocking(move || {
|
||||
cpu_time_monitor_loop(cpu_time_start, execution_timeout, finished_rx)
|
||||
})
|
||||
.fuse();
|
||||
let executor_2 = executor.clone();
|
||||
let execute_fut = rt_handle
|
||||
.spawn_blocking(move || {
|
||||
validate_using_artifact(&artifact_path, ¶ms, executor_2, cpu_time_start)
|
||||
})
|
||||
.fuse();
|
||||
|
||||
pin_mut!(cpu_time_monitor_fut);
|
||||
pin_mut!(execute_fut);
|
||||
|
||||
let response = select_biased! {
|
||||
// If this future is not selected, the join handle is dropped and the thread will
|
||||
// finish in the background.
|
||||
cpu_time_monitor_res = cpu_time_monitor_fut => {
|
||||
match cpu_time_monitor_res {
|
||||
Ok(Some(cpu_time_elapsed)) => {
|
||||
// Log if we exceed the timeout and the other thread hasn't finished.
|
||||
gum::warn!(
|
||||
target: LOG_TARGET,
|
||||
%worker_pid,
|
||||
"execute job took {}ms cpu time, exceeded execute timeout {}ms",
|
||||
cpu_time_elapsed.as_millis(),
|
||||
execution_timeout.as_millis(),
|
||||
);
|
||||
Response::TimedOut
|
||||
},
|
||||
Ok(None) => Response::InternalError("error communicating over finished channel".into()),
|
||||
Err(e) => Response::format_internal("cpu time monitor thread error", &e.to_string()),
|
||||
}
|
||||
},
|
||||
execute_res = execute_fut => {
|
||||
let _ = finished_tx.send(());
|
||||
execute_res.unwrap_or_else(|e| Response::format_internal("execute thread error", &e.to_string()))
|
||||
},
|
||||
};
|
||||
|
||||
send_response(&mut stream, response).await?;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn validate_using_artifact(
|
||||
artifact_path: &Path,
|
||||
params: &[u8],
|
||||
executor: Arc<Executor>,
|
||||
cpu_time_start: ProcessTime,
|
||||
) -> Response {
|
||||
// Check here if the file exists, because the error from Substrate is not match-able.
|
||||
// TODO: Re-evaluate after <https://github.com/paritytech/substrate/issues/13860>.
|
||||
let file_metadata = std::fs::metadata(artifact_path);
|
||||
if let Err(err) = file_metadata {
|
||||
return Response::format_internal("execute: could not find or open file", &err.to_string())
|
||||
}
|
||||
|
||||
let descriptor_bytes = match unsafe {
|
||||
// SAFETY: this should be safe since the compiled artifact passed here comes from the
|
||||
// file created by the prepare workers. These files are obtained by calling
|
||||
// [`executor_intf::prepare`].
|
||||
executor.execute(artifact_path.as_ref(), params)
|
||||
} {
|
||||
Err(err) => return Response::format_invalid("execute", &err),
|
||||
Ok(d) => d,
|
||||
};
|
||||
|
||||
let duration = cpu_time_start.elapsed();
|
||||
|
||||
let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) {
|
||||
Err(err) =>
|
||||
return Response::format_invalid("validation result decoding failed", &err.to_string()),
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
Response::Ok { result_descriptor, duration }
|
||||
}
|
||||
Reference in New Issue
Block a user