Add logging to PVF and other related parts (#3596)

Co-authored-by: Bastian Köcher <bkchr@users.noreply.github.com>
This commit is contained in:
Sergei Shulepov
2021-08-08 19:39:16 +02:00
committed by GitHub
parent 968132155a
commit 9d6ed7ecae
10 changed files with 211 additions and 58 deletions
+54 -21
View File
@@ -18,6 +18,7 @@
use super::worker::Outcome;
use crate::{
artifacts::{ArtifactId, ArtifactPathId},
host::ResultSender,
worker_common::{IdleWorker, WorkerHandle},
InvalidCandidate, ValidationError, LOG_TARGET,
@@ -36,11 +37,11 @@ slotmap::new_key_type! { struct Worker; }
#[derive(Debug)]
pub enum ToQueue {
Enqueue { artifact_path: PathBuf, params: Vec<u8>, result_tx: ResultSender },
Enqueue { artifact: ArtifactPathId, params: Vec<u8>, result_tx: ResultSender },
}
struct ExecuteJob {
artifact_path: PathBuf,
artifact: ArtifactPathId,
params: Vec<u8>,
result_tx: ResultSender,
}
@@ -87,8 +88,8 @@ impl Workers {
}
enum QueueEvent {
Spawn((IdleWorker, WorkerHandle)),
StartWork(Worker, Outcome, ResultSender),
Spawn(IdleWorker, WorkerHandle),
StartWork(Worker, Outcome, ArtifactId, ResultSender),
}
type Mux = FuturesUnordered<BoxFuture<'static, QueueEvent>>;
@@ -159,9 +160,13 @@ async fn purge_dead(workers: &mut Workers) {
}
fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) {
let ToQueue::Enqueue { artifact_path, params, result_tx } = to_queue;
let job = ExecuteJob { artifact_path, params, result_tx };
let ToQueue::Enqueue { artifact, params, result_tx } = to_queue;
tracing::debug!(
target: LOG_TARGET,
validation_code_hash = ?artifact.id.code_hash,
"enqueueing an artifact for execution",
);
let job = ExecuteJob { artifact, params, result_tx };
if let Some(available) = queue.workers.find_available() {
assign(queue, available, job);
@@ -175,24 +180,35 @@ fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) {
async fn handle_mux(queue: &mut Queue, event: QueueEvent) {
match event {
QueueEvent::Spawn((idle, handle)) => {
queue.workers.spawn_inflight -= 1;
let worker = queue.workers.running.insert(WorkerData { idle: Some(idle), handle });
if let Some(job) = queue.queue.pop_front() {
assign(queue, worker, job);
}
QueueEvent::Spawn(idle, handle) => {
handle_worker_spawned(queue, idle, handle);
},
QueueEvent::StartWork(worker, outcome, result_tx) => {
handle_job_finish(queue, worker, outcome, result_tx);
QueueEvent::StartWork(worker, outcome, artifact_id, result_tx) => {
handle_job_finish(queue, worker, outcome, artifact_id, result_tx);
},
}
}
fn handle_worker_spawned(queue: &mut Queue, idle: IdleWorker, handle: WorkerHandle) {
queue.workers.spawn_inflight -= 1;
let worker = queue.workers.running.insert(WorkerData { idle: Some(idle), handle });
tracing::debug!(target: LOG_TARGET, ?worker, "execute worker spawned");
if let Some(job) = queue.queue.pop_front() {
assign(queue, worker, job);
}
}
/// If there are pending jobs in the queue, schedules the next of them onto the just freed up
/// worker. Otherwise, puts back into the available workers list.
fn handle_job_finish(queue: &mut Queue, worker: Worker, outcome: Outcome, result_tx: ResultSender) {
fn handle_job_finish(
queue: &mut Queue,
worker: Worker,
outcome: Outcome,
artifact_id: ArtifactId,
result_tx: ResultSender,
) {
let (idle_worker, result) = match outcome {
Outcome::Ok { result_descriptor, duration_ms, idle_worker } => {
// TODO: propagate the soft timeout
@@ -212,6 +228,14 @@ fn handle_job_finish(queue: &mut Queue, worker: Worker, outcome: Outcome, result
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbigiousWorkerDeath))),
};
tracing::debug!(
target: LOG_TARGET,
validation_code_hash = ?artifact_id.code_hash,
worker_rip = idle_worker.is_none(),
?result,
"job finished.",
);
// First we send the result. It may fail due the other end of the channel being dropped, that's
// legitimate and we don't treat that as an error.
let _ = result_tx.send(result);
@@ -245,6 +269,8 @@ fn handle_job_finish(queue: &mut Queue, worker: Worker, outcome: Outcome, result
}
fn spawn_extra_worker(queue: &mut Queue) {
tracing::debug!(target: LOG_TARGET, "spawning an extra worker");
queue
.mux
.push(spawn_worker_task(queue.program_path.clone(), queue.spawn_timeout).boxed());
@@ -256,7 +282,7 @@ async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> Qu
loop {
match super::worker::spawn(&program_path, spawn_timeout).await {
Ok((idle, handle)) => break QueueEvent::Spawn((idle, handle)),
Ok((idle, handle)) => break QueueEvent::Spawn(idle, handle),
Err(err) => {
tracing::warn!(target: LOG_TARGET, "failed to spawn an execute worker: {:?}", err);
@@ -271,6 +297,13 @@ async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> Qu
///
/// The worker must be running and idle.
fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) {
tracing::debug!(
target: LOG_TARGET,
validation_code_hash = ?job.artifact.id,
?worker,
"assigning the execute worker",
);
let idle = queue.workers.claim_idle(worker).expect(
"this caller must supply a worker which is idle and running;
thus claim_idle cannot return None;
@@ -278,8 +311,8 @@ fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) {
);
queue.mux.push(
async move {
let outcome = super::worker::start_work(idle, job.artifact_path, job.params).await;
QueueEvent::StartWork(worker, outcome, job.result_tx)
let outcome = super::worker::start_work(idle, job.artifact.clone(), job.params).await;
QueueEvent::StartWork(worker, outcome, job.artifact.id, job.result_tx)
}
.boxed(),
);
+31 -6
View File
@@ -15,7 +15,7 @@
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use crate::{
artifacts::Artifact,
artifacts::{Artifact, ArtifactPathId},
executor_intf::TaskExecutor,
worker_common::{
bytes_to_path, framed_recv, framed_send, path_to_bytes, spawn_with_program_path,
@@ -68,7 +68,7 @@ pub enum Outcome {
/// returns the outcome.
pub async fn start_work(
worker: IdleWorker,
artifact_path: PathBuf,
artifact: ArtifactPathId,
validation_params: Vec<u8>,
) -> Outcome {
let IdleWorker { mut stream, pid } = worker;
@@ -76,22 +76,47 @@ pub async fn start_work(
tracing::debug!(
target: LOG_TARGET,
worker_pid = %pid,
validation_code_hash = ?artifact.id.code_hash,
"starting execute for {}",
artifact_path.display(),
artifact.path.display(),
);
if send_request(&mut stream, &artifact_path, &validation_params).await.is_err() {
if let Err(error) = send_request(&mut stream, &artifact.path, &validation_params).await {
tracing::warn!(
target: LOG_TARGET,
worker_pid = %pid,
validation_code_hash = ?artifact.id.code_hash,
?error,
"failed to send an execute request",
);
return Outcome::IoErr
}
let response = futures::select! {
response = recv_response(&mut stream).fuse() => {
match response {
Err(_err) => return Outcome::IoErr,
Err(error) => {
tracing::warn!(
target: LOG_TARGET,
worker_pid = %pid,
validation_code_hash = ?artifact.id.code_hash,
?error,
"failed to recv an execute response",
);
return Outcome::IoErr
},
Ok(response) => response,
}
},
_ = Delay::new(EXECUTION_TIMEOUT).fuse() => return Outcome::HardTimeout,
_ = Delay::new(EXECUTION_TIMEOUT).fuse() => {
tracing::warn!(
target: LOG_TARGET,
worker_pid = %pid,
validation_code_hash = ?artifact.id.code_hash,
"execution worker exceeded alloted time for execution",
);
return Outcome::HardTimeout;
},
};
match response {