PVF: Add Secure Validator Mode (#2486)

Co-authored-by: Javier Viola <javier@parity.io>
This commit is contained in:
Marcin S
2023-12-05 13:32:56 +01:00
committed by GitHub
parent f240e02557
commit c046a9d5ed
31 changed files with 690 additions and 469 deletions
+12 -1
View File
@@ -66,6 +66,7 @@ use polkadot_parachain_primitives::primitives::ValidationCodeHash;
use polkadot_primitives::ExecutorParamsHash;
use std::{
collections::HashMap,
io,
path::{Path, PathBuf},
str::FromStr as _,
time::{Duration, SystemTime},
@@ -290,7 +291,17 @@ impl Artifacts {
}
// Make sure that the cache path directory and all its parents are created.
let _ = tokio::fs::create_dir_all(cache_path).await;
if let Err(err) = tokio::fs::create_dir_all(cache_path).await {
if err.kind() != io::ErrorKind::AlreadyExists {
gum::error!(
target: LOG_TARGET,
?err,
"failed to create dir {:?}",
cache_path,
);
return
}
}
let mut dir = match tokio::fs::read_dir(cache_path).await {
Ok(dir) => dir,
@@ -62,16 +62,16 @@ pub async fn spawn(
security_status,
)
.await?;
send_handshake(&mut idle_worker.stream, Handshake { executor_params })
send_execute_handshake(&mut idle_worker.stream, Handshake { executor_params })
.await
.map_err(|error| {
let err = SpawnErr::Handshake { err: error.to_string() };
gum::warn!(
target: LOG_TARGET,
worker_pid = %idle_worker.pid,
?error,
"failed to send a handshake to the spawned worker",
%err
);
SpawnErr::Handshake
err
})?;
Ok((idle_worker, worker_handle))
}
@@ -286,7 +286,8 @@ where
outcome
}
async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> {
/// Sends a handshake with information specific to the execute worker.
async fn send_execute_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> {
framed_send(stream, &handshake.encode()).await
}
+11 -3
View File
@@ -36,7 +36,7 @@ use polkadot_node_core_pvf_common::{
prepare::PrepareSuccess,
pvf::PvfPrepData,
};
use polkadot_node_subsystem::SubsystemResult;
use polkadot_node_subsystem::{SubsystemError, SubsystemResult};
use polkadot_parachain_primitives::primitives::ValidationResult;
use std::{
collections::HashMap,
@@ -156,6 +156,8 @@ pub struct Config {
pub cache_path: PathBuf,
/// The version of the node. `None` can be passed to skip the version check (only for tests).
pub node_version: Option<String>,
/// Whether the node is attempting to run as a secure validator.
pub secure_validator_mode: bool,
/// The path to the program that can be used to spawn the prepare workers.
pub prepare_worker_program_path: PathBuf,
@@ -180,12 +182,14 @@ impl Config {
pub fn new(
cache_path: PathBuf,
node_version: Option<String>,
secure_validator_mode: bool,
prepare_worker_program_path: PathBuf,
execute_worker_program_path: PathBuf,
) -> Self {
Self {
cache_path,
node_version,
secure_validator_mode,
prepare_worker_program_path,
prepare_worker_spawn_timeout: Duration::from_secs(3),
@@ -213,8 +217,12 @@ pub async fn start(
) -> SubsystemResult<(ValidationHost, impl Future<Output = ()>)> {
gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host");
// Run checks for supported security features once per host startup. Warn here if not enabled.
let security_status = security::check_security_status(&config).await;
// Run checks for supported security features once per host startup. If some checks fail, warn
// if Secure Validator Mode is disabled and return an error otherwise.
let security_status = match security::check_security_status(&config).await {
Ok(ok) => ok,
Err(err) => return Err(SubsystemError::Context(err)),
};
let (to_host_tx, to_host_rx) = mpsc::channel(10);
+148 -49
View File
@@ -18,18 +18,19 @@ use crate::{Config, SecurityStatus, LOG_TARGET};
use futures::join;
use std::{fmt, path::Path};
const SECURE_MODE_ANNOUNCEMENT: &'static str =
"In the next release this will be a hard error by default.
\nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode";
/// Run checks for supported security features.
///
/// # Returns
///
/// Returns the set of security features that we were able to enable. If an error occurs while
/// enabling a security feature we set the corresponding status to `false`.
pub async fn check_security_status(config: &Config) -> SecurityStatus {
let Config { prepare_worker_program_path, cache_path, .. } = config;
///
/// # Errors
///
/// Returns an error only if we could not fully enforce the security level required by the current
/// configuration.
pub async fn check_security_status(config: &Config) -> Result<SecurityStatus, String> {
let Config { prepare_worker_program_path, secure_validator_mode, cache_path, .. } = config;
let (landlock, seccomp, change_root) = join!(
check_landlock(prepare_worker_program_path),
@@ -37,26 +38,81 @@ pub async fn check_security_status(config: &Config) -> SecurityStatus {
check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path, cache_path)
);
let security_status = SecurityStatus {
can_enable_landlock: landlock.is_ok(),
can_enable_seccomp: seccomp.is_ok(),
can_unshare_user_namespace_and_change_root: change_root.is_ok(),
};
let full_security_status =
FullSecurityStatus::new(*secure_validator_mode, landlock, seccomp, change_root);
let security_status = full_security_status.as_partial();
let errs: Vec<SecureModeError> = [landlock, seccomp, change_root]
.into_iter()
.filter_map(|result| result.err())
.collect();
let err_occurred = print_secure_mode_message(errs);
if err_occurred {
gum::error!(
if full_security_status.err_occurred() {
print_secure_mode_error_or_warning(&full_security_status);
if !full_security_status.all_errs_allowed() {
return Err("could not enable Secure Validator Mode; check logs".into())
}
}
if security_status.secure_validator_mode {
gum::info!(
target: LOG_TARGET,
"{}",
SECURE_MODE_ANNOUNCEMENT,
"👮‍♀️ Running in Secure Validator Mode. \
It is highly recommended that you operate according to our security guidelines. \
\nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode"
);
}
security_status
Ok(security_status)
}
/// Complete outcome of the security checks: the capability flags together with every error that
/// was produced while checking for them.
struct FullSecurityStatus {
    /// The flags-only view of the outcome; this is what `as_partial` hands out.
    partial: SecurityStatus,
    /// Errors from the checks that failed. Empty when every check succeeded.
    errs: Vec<SecureModeError>,
}

impl FullSecurityStatus {
    /// Builds the full status from the requested mode and the three individual check results.
    ///
    /// Each capability flag is `true` exactly when its check returned `Ok`; the `Err` payloads are
    /// kept in the order landlock, seccomp, change-root.
    fn new(
        secure_validator_mode: bool,
        landlock: SecureModeResult,
        seccomp: SecureModeResult,
        change_root: SecureModeResult,
    ) -> Self {
        // Record the flags first: the results are moved into the array below.
        let partial = SecurityStatus {
            secure_validator_mode,
            can_enable_landlock: landlock.is_ok(),
            can_enable_seccomp: seccomp.is_ok(),
            can_unshare_user_namespace_and_change_root: change_root.is_ok(),
        };

        let mut errs = Vec::new();
        for outcome in [landlock, seccomp, change_root] {
            if let Err(err) = outcome {
                errs.push(err);
            }
        }

        Self { partial, errs }
    }

    /// Returns a copy of the flags-only status.
    fn as_partial(&self) -> SecurityStatus {
        self.partial.clone()
    }

    /// Whether at least one security check failed.
    fn err_occurred(&self) -> bool {
        !self.errs.is_empty()
    }

    /// Whether the recorded errors are acceptable for the current mode.
    ///
    /// Always `true` when Secure Validator Mode is off; otherwise every error must be one that is
    /// allowed in secure mode given the capabilities that are available.
    fn all_errs_allowed(&self) -> bool {
        if !self.partial.secure_validator_mode {
            return true
        }
        self.errs.iter().all(|err| err.is_allowed_in_secure_mode(&self.partial))
    }

    /// Renders the recorded errors as a bullet list, one per line, prefixing the ones that are
    /// allowed in secure mode with `Optional: `.
    fn errs_string(&self) -> String {
        let mut listing = String::new();
        for err in &self.errs {
            let prefix =
                if err.is_allowed_in_secure_mode(&self.partial) { "Optional: " } else { "" };
            listing.push_str(&format!("\n - {}{}", prefix, err));
        }
        listing
    }
}
type SecureModeResult = std::result::Result<(), SecureModeError>;
@@ -71,12 +127,17 @@ enum SecureModeError {
impl SecureModeError {
/// Whether this error is allowed with Secure Validator Mode enabled.
fn is_allowed_in_secure_mode(&self) -> bool {
fn is_allowed_in_secure_mode(&self, security_status: &SecurityStatus) -> bool {
use SecureModeError::*;
match self {
CannotEnableLandlock(_) => true,
// Landlock is present on relatively recent Linuxes. This is optional if the unshare
// capability is present, providing FS sandboxing a different way.
CannotEnableLandlock(_) => security_status.can_unshare_user_namespace_and_change_root,
// seccomp should be present on all modern Linuxes unless it's been disabled.
CannotEnableSeccomp(_) => false,
CannotUnshareUserNamespaceAndChangeRoot(_) => false,
// Should always be present on modern Linuxes. If not, Landlock also provides FS
// sandboxing, so don't enforce this.
CannotUnshareUserNamespaceAndChangeRoot(_) => security_status.can_enable_landlock,
}
}
}
@@ -92,12 +153,8 @@ impl fmt::Display for SecureModeError {
}
}
/// Errors if Secure Validator Mode and some mandatory errors occurred, warn otherwise.
///
/// # Returns
///
/// `true` if an error was printed, `false` otherwise.
fn print_secure_mode_message(errs: Vec<SecureModeError>) -> bool {
/// Print an error if Secure Validator Mode is enabled and some mandatory errors occurred; warn otherwise.
fn print_secure_mode_error_or_warning(security_status: &FullSecurityStatus) {
// Trying to run securely and some mandatory errors occurred.
const SECURE_MODE_ERROR: &'static str = "🚨 Your system cannot securely run a validator. \
\nRunning validation of malicious PVF code has a higher risk of compromising this machine.";
@@ -105,39 +162,31 @@ fn print_secure_mode_message(errs: Vec<SecureModeError>) -> bool {
// securely.
const SECURE_MODE_WARNING: &'static str = "🚨 Some security issues have been detected. \
\nRunning validation of malicious PVF code has a higher risk of compromising this machine.";
// Message to be printed only when running securely and mandatory errors occurred.
const IGNORE_SECURE_MODE_TIP: &'static str =
"\nYou can ignore this error with the `--insecure-validator-i-know-what-i-do` \
command line argument if you understand and accept the risks of running insecurely. \
With this flag, security features are enabled on a best-effort basis, but not mandatory. \
\nMore information: https://wiki.polkadot.network/docs/maintain-guides-secure-validator#secure-validator-mode";
if errs.is_empty() {
return false
}
let all_errs_allowed = security_status.all_errs_allowed();
let errs_string = security_status.errs_string();
let errs_allowed = errs.iter().all(|err| err.is_allowed_in_secure_mode());
let errs_string: String = errs
.iter()
.map(|err| {
format!(
"\n - {}{}",
if err.is_allowed_in_secure_mode() { "Optional: " } else { "" },
err
)
})
.collect();
if errs_allowed {
if all_errs_allowed {
gum::warn!(
target: LOG_TARGET,
"{}{}",
SECURE_MODE_WARNING,
errs_string,
);
false
} else {
gum::error!(
target: LOG_TARGET,
"{}{}",
"{}{}{}",
SECURE_MODE_ERROR,
errs_string,
IGNORE_SECURE_MODE_TIP
);
true
}
}
@@ -298,3 +347,53 @@ async fn check_seccomp(
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SecurityStatus` with Secure Validator Mode enabled and the given capability
    /// flags.
    fn secure_status(landlock: bool, seccomp: bool, change_root: bool) -> SecurityStatus {
        SecurityStatus {
            secure_validator_mode: true,
            can_enable_landlock: landlock,
            can_enable_seccomp: seccomp,
            can_unshare_user_namespace_and_change_root: change_root,
        }
    }

    /// Checks, for each error kind, in which capability configurations it counts as optional.
    #[test]
    fn test_secure_mode_error_optionality() {
        // The three configurations exercised below: exactly one capability available each.
        let only_change_root = secure_status(false, false, true);
        let only_seccomp = secure_status(false, true, false);
        let only_landlock = secure_status(true, false, false);

        // A Landlock failure is tolerated only when unshare/change-root is available.
        let landlock_err = SecureModeError::CannotEnableLandlock(String::new());
        assert!(landlock_err.is_allowed_in_secure_mode(&only_change_root));
        assert!(!landlock_err.is_allowed_in_secure_mode(&only_seccomp));

        // A seccomp failure is never tolerated.
        let seccomp_err = SecureModeError::CannotEnableSeccomp(String::new());
        assert!(!seccomp_err.is_allowed_in_secure_mode(&only_change_root));
        assert!(!seccomp_err.is_allowed_in_secure_mode(&only_seccomp));

        // An unshare/change-root failure is tolerated only when Landlock is available.
        let change_root_err = SecureModeError::CannotUnshareUserNamespaceAndChangeRoot(String::new());
        assert!(change_root_err.is_allowed_in_secure_mode(&only_landlock));
        assert!(!change_root_err.is_allowed_in_secure_mode(&only_seccomp));
    }
}
+58 -91
View File
@@ -19,8 +19,9 @@
use crate::LOG_TARGET;
use futures::FutureExt as _;
use futures_timer::Delay;
use parity_scale_codec::Encode;
use pin_project::pin_project;
use polkadot_node_core_pvf_common::SecurityStatus;
use polkadot_node_core_pvf_common::{SecurityStatus, WorkerHandshake};
use rand::Rng;
use std::{
fmt, mem,
@@ -68,83 +69,54 @@ pub async fn spawn_with_program_path(
let program_path = program_path.into();
let worker_dir = WorkerDir::new(debug_id, cache_path).await?;
let extra_args: Vec<String> = extra_args.iter().map(|arg| arg.to_string()).collect();
// Hack the borrow-checker.
let program_path_clone = program_path.clone();
let worker_dir_clone = worker_dir.path().to_owned();
let extra_args_clone = extra_args.clone();
with_transient_socket_path(debug_id, |socket_path| {
let socket_path = socket_path.to_owned();
let worker_dir_path = worker_dir.path().to_owned();
async move {
let listener = UnixListener::bind(&socket_path).map_err(|err| {
gum::warn!(
target: LOG_TARGET,
%debug_id,
?program_path,
?extra_args,
?worker_dir,
?socket_path,
"cannot bind unix socket: {:?}",
err,
);
SpawnErr::Bind
})?;
let listener = match UnixListener::bind(&socket_path) {
Ok(ok) => ok,
Err(err) => return Err(SpawnErr::Bind { socket_path, err: err.to_string() }),
};
let handle = WorkerHandle::spawn(
&program_path,
&extra_args,
&socket_path,
&worker_dir_path,
security_status,
)
.map_err(|err| {
gum::warn!(
target: LOG_TARGET,
%debug_id,
?program_path,
?extra_args,
?worker_dir_path,
?socket_path,
"cannot spawn a worker: {:?}",
err,
);
SpawnErr::ProcessSpawn
})?;
let handle =
WorkerHandle::spawn(&program_path, &extra_args, &socket_path, &worker_dir.path())
.map_err(|err| SpawnErr::ProcessSpawn { program_path, err: err.to_string() })?;
futures::select! {
accept_result = listener.accept().fuse() => {
let (stream, _) = accept_result.map_err(|err| {
gum::warn!(
target: LOG_TARGET,
%debug_id,
?program_path,
?extra_args,
?worker_dir_path,
?socket_path,
"cannot accept a worker: {:?}",
err,
);
SpawnErr::Accept
})?;
let (mut stream, _) = accept_result
.map_err(|err| SpawnErr::Accept { socket_path, err: err.to_string() })?;
send_worker_handshake(&mut stream, WorkerHandshake { security_status })
.await
.map_err(|err| SpawnErr::Handshake { err: err.to_string() })?;
Ok((IdleWorker { stream, pid: handle.id(), worker_dir }, handle))
}
_ = Delay::new(spawn_timeout).fuse() => {
gum::warn!(
target: LOG_TARGET,
%debug_id,
?program_path,
?extra_args,
?worker_dir_path,
?socket_path,
?spawn_timeout,
"spawning and connecting to socket timed out",
);
Err(SpawnErr::AcceptTimeout)
}
_ = Delay::new(spawn_timeout).fuse() => Err(SpawnErr::AcceptTimeout{spawn_timeout}),
}
}
})
.await
.map_err(|err| {
gum::warn!(
target: LOG_TARGET,
%debug_id,
?program_path_clone,
?extra_args_clone,
?worker_dir_clone,
"error spawning worker: {}",
err,
);
err
})
}
/// A temporary, random, free path that is necessary only to establish socket communications. If a
/// directory exists at the path when this function finishes, it is removed.
async fn with_transient_socket_path<T, F, Fut>(debug_id: &'static str, f: F) -> Result<T, SpawnErr>
where
F: FnOnce(&Path) -> Fut,
@@ -214,21 +186,26 @@ pub struct IdleWorker {
pub worker_dir: WorkerDir,
}
/// This is publicly exposed only for integration tests.
///
/// An error happened during spawning a worker process.
#[derive(Clone, Debug)]
#[derive(thiserror::Error, Clone, Debug)]
#[doc(hidden)]
pub enum SpawnErr {
/// Cannot obtain a temporary path location.
#[error("cannot obtain a temporary path location")]
TmpPath,
/// Cannot bind the socket to the given path.
Bind,
/// An error happened during accepting a connection to the socket.
Accept,
/// An error happened during spawning the process.
ProcessSpawn,
/// The deadline allotted for the worker spawning and connecting to the socket has elapsed.
AcceptTimeout,
/// Failed to send handshake after successful spawning was signaled
Handshake,
#[error("cannot bind the socket to the given path {socket_path:?}: {err}")]
Bind { socket_path: PathBuf, err: String },
#[error(
"an error happened during accepting a connection to the socket {socket_path:?}: {err}"
)]
Accept { socket_path: PathBuf, err: String },
#[error("an error happened during spawning the process at path {program_path:?}: {err}")]
ProcessSpawn { program_path: PathBuf, err: String },
#[error("the deadline {}ms allotted for the worker spawning and connecting to the socket has elapsed", .spawn_timeout.as_millis())]
AcceptTimeout { spawn_timeout: Duration },
#[error("failed to send handshake after successful spawning was signaled: {err}")]
Handshake { err: String },
}
/// This is a representation of a potentially running worker. Drop it and the process will be
@@ -256,22 +233,7 @@ impl WorkerHandle {
extra_args: &[String],
socket_path: impl AsRef<Path>,
worker_dir_path: impl AsRef<Path>,
security_status: SecurityStatus,
) -> io::Result<Self> {
let security_args = {
let mut args = vec![];
if security_status.can_enable_landlock {
args.push("--can-enable-landlock".to_string());
}
if security_status.can_enable_seccomp {
args.push("--can-enable-seccomp".to_string());
}
if security_status.can_unshare_user_namespace_and_change_root {
args.push("--can-unshare-user-namespace-and-change-root".to_string());
}
args
};
// Clear all env vars from the spawned process.
let mut command = process::Command::new(program.as_ref());
command.env_clear();
@@ -286,7 +248,6 @@ impl WorkerHandle {
.arg(socket_path.as_ref().as_os_str())
.arg("--worker-dir-path")
.arg(worker_dir_path.as_ref().as_os_str())
.args(&security_args)
.stdout(std::process::Stdio::piped())
.kill_on_drop(true)
.spawn()?;
@@ -386,6 +347,14 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result<Vec<u8>
Ok(buf)
}
/// Encodes the given [`WorkerHandshake`] and passes the resulting bytes to `framed_send` on the
/// worker's stream.
///
/// # Errors
///
/// Returns whatever I/O error `framed_send` reports.
async fn send_worker_handshake(
    stream: &mut UnixStream,
    handshake: WorkerHandshake,
) -> io::Result<()> {
    let payload = handshake.encode();
    framed_send(stream, &payload).await
}
/// A temporary worker dir that contains only files needed by the worker. The worker will change its
/// root (the `/` directory) to this directory; it should have access to no other paths on its
/// filesystem.
@@ -433,8 +402,6 @@ impl WorkerDir {
// Not async since Rust has trouble with async recursion. There should be few files here anyway.
//
// TODO: A lingering malicious job can still access future files in this dir. See
// <https://github.com/paritytech/polkadot-sdk/issues/574> for how to fully secure this.
/// Clear the temporary worker dir without deleting it. Not deleting is important because the worker
/// has mounted its own separate filesystem here.
///