Use clone instead of fork on pvf (#2477)

@mrcnski Done the change on the prepare worker, once the prepare worker
part is good I'll do the same for the execute worker.

This is based on
https://github.com/koute/polkavm/blob/11beebd06276ce9b84f335350138479e714f6caf/crates/polkavm/src/sandbox/linux.rs#L711.

## TODO

- [x] Add a check for this capability at startup
- [x] Add prdoc mentioning the new Secure Validator Mode (optional)
requirement.

## Related

Closes #2162

---------

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
jserrat
2024-01-21 11:15:36 +00:00
committed by GitHub
parent caa987d26d
commit 21ef949b6e
20 changed files with 861 additions and 395 deletions
+3 -2
View File
@@ -92,10 +92,11 @@ pub enum JobError {
TimedOut,
#[error("An unexpected panic has occurred in the execution job: {0}")]
Panic(String),
/// Some error occurred when interfacing with the kernel.
#[error("Error interfacing with the kernel: {0}")]
Kernel(String),
#[error("Could not spawn the requested thread: {0}")]
CouldNotSpawnThread(String),
#[error("An error occurred in the CPU time monitor thread: {0}")]
CpuTimeMonitorThread(String),
#[error("Could not set pdeathsig: {0}")]
CouldNotSetPdeathsig(String),
}
@@ -140,7 +140,7 @@ pub unsafe fn create_runtime_from_artifact_bytes(
executor_params: &ExecutorParams,
) -> Result<WasmtimeRuntime, WasmError> {
let mut config = DEFAULT_CONFIG.clone();
config.semantics = params_to_wasmtime_semantics(executor_params);
config.semantics = params_to_wasmtime_semantics(executor_params).0;
sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
compiled_artifact_blob,
@@ -148,7 +148,10 @@ pub unsafe fn create_runtime_from_artifact_bytes(
)
}
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
/// Takes the default config and overwrites any settings with existing executor parameters.
///
/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
let mut sem = DEFAULT_CONFIG.semantics.clone();
let mut stack_limit = sem
.deterministic_stack_limit
@@ -169,8 +172,8 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
}
}
sem.deterministic_stack_limit = Some(stack_limit);
sem
sem.deterministic_stack_limit = Some(stack_limit.clone());
(sem, stack_limit)
}
/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
@@ -187,7 +190,7 @@ pub fn prepare(
blob: RuntimeBlob,
executor_params: &ExecutorParams,
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
let semantics = params_to_wasmtime_semantics(executor_params);
let (semantics, _) = params_to_wasmtime_semantics(executor_params);
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
}
+2
View File
@@ -57,6 +57,8 @@ pub struct SecurityStatus {
pub can_enable_seccomp: bool,
/// Whether we are able to unshare the user namespace and change the filesystem root.
pub can_unshare_user_namespace_and_change_root: bool,
/// Whether we are able to call `clone` with all sandboxing flags.
pub can_do_secure_clone: bool,
}
/// A handshake with information for the worker.
+128 -32
View File
@@ -18,14 +18,19 @@
pub mod security;
use crate::{framed_recv_blocking, WorkerHandshake, LOG_TARGET};
use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
use cpu_time::ProcessTime;
use futures::never::Never;
use parity_scale_codec::Decode;
use std::{
any::Any,
fmt, io,
os::unix::net::UnixStream,
fmt::{self},
fs::File,
io::{self, Read, Write},
os::{
fd::{AsRawFd, FromRawFd, RawFd},
unix::net::UnixStream,
},
path::PathBuf,
sync::mpsc::{Receiver, RecvTimeoutError},
time::Duration,
@@ -78,7 +83,7 @@ macro_rules! decl_worker_main {
"--check-can-enable-landlock" => {
#[cfg(target_os = "linux")]
let status = if let Err(err) = security::landlock::check_is_fully_enabled() {
let status = if let Err(err) = security::landlock::check_can_fully_enable() {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
@@ -91,7 +96,7 @@ macro_rules! decl_worker_main {
},
"--check-can-enable-seccomp" => {
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
let status = if let Err(err) = security::seccomp::check_is_fully_enabled() {
let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
@@ -107,7 +112,7 @@ macro_rules! decl_worker_main {
let cache_path_tempdir = std::path::Path::new(&args[2]);
#[cfg(target_os = "linux")]
let status = if let Err(err) =
security::change_root::check_is_fully_enabled(&cache_path_tempdir)
security::change_root::check_can_fully_enable(&cache_path_tempdir)
{
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
@@ -119,6 +124,21 @@ macro_rules! decl_worker_main {
let status = -1;
std::process::exit(status)
},
"--check-can-do-secure-clone" => {
#[cfg(target_os = "linux")]
// SAFETY: new process is spawned within a single threaded process. This
// invariant is enforced by tests.
let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
} else {
0
};
#[cfg(not(target_os = "linux"))]
let status = -1;
std::process::exit(status)
},
"test-sleep" => {
std::thread::sleep(std::time::Duration::from_secs(5));
@@ -171,6 +191,84 @@ macro_rules! decl_worker_main {
};
}
//taken from the os_pipe crate. Copied here to reduce one dependency and
// because its type-safe abstractions do not play well with nix's clone
#[cfg(not(target_os = "macos"))]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
let mut fds: [libc::c_int; 2] = [0; 2];
let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
Ok((fds[0], fds[1]))
}
#[cfg(target_os = "macos")]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
let mut fds: [libc::c_int; 2] = [0; 2];
let res = unsafe { libc::pipe(fds.as_mut_ptr()) };
if res != 0 {
return Err(io::Error::last_os_error())
}
let res = unsafe { libc::fcntl(fds[0], libc::F_SETFD, libc::FD_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
let res = unsafe { libc::fcntl(fds[1], libc::F_SETFD, libc::FD_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
Ok((fds[0], fds[1]))
}
/// A wrapper around a file descriptor used to encapsulate and restrict
/// functionality for pipe operations.
pub struct PipeFd {
file: File,
}
impl AsRawFd for PipeFd {
/// Returns the raw file descriptor associated with this `PipeFd`
fn as_raw_fd(&self) -> RawFd {
self.file.as_raw_fd()
}
}
impl FromRawFd for PipeFd {
/// Creates a new `PipeFd` instance from a raw file descriptor.
///
/// # Safety
///
/// The fd passed in must be an owned file descriptor; in particular, it must be open.
unsafe fn from_raw_fd(fd: RawFd) -> Self {
PipeFd { file: File::from_raw_fd(fd) }
}
}
impl Read for PipeFd {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.file.read(buf)
}
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
self.file.read_to_end(buf)
}
}
impl Write for PipeFd {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.file.write(buf)
}
fn flush(&mut self) -> io::Result<()> {
self.file.flush()
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.file.write_all(buf)
}
}
/// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
/// child process.
pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
@@ -192,14 +290,12 @@ impl fmt::Display for WorkerKind {
}
}
// Some fields are only used for logging, and dead-code analysis ignores Debug.
#[allow(dead_code)]
#[derive(Debug)]
pub struct WorkerInfo {
pid: u32,
kind: WorkerKind,
version: Option<String>,
worker_dir_path: PathBuf,
pub pid: u32,
pub kind: WorkerKind,
pub version: Option<String>,
pub worker_dir_path: PathBuf,
}
// NOTE: The worker version must be passed in so that we accurately get the version of the worker,
@@ -218,7 +314,7 @@ pub fn run_worker<F>(
worker_version: Option<&str>,
mut event_loop: F,
) where
F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
{
#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
let mut worker_info = WorkerInfo {
@@ -250,11 +346,8 @@ pub fn run_worker<F>(
}
// Make sure that we can read the worker dir path, and log its contents.
let entries = || -> Result<Vec<_>, io::Error> {
std::fs::read_dir(&worker_info.worker_dir_path)?
.map(|res| res.map(|e| e.file_name()))
.collect()
}();
let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
match entries {
Ok(entries) =>
gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries),
@@ -284,6 +377,22 @@ pub fn run_worker<F>(
{
gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");
// First, make sure env vars were cleared, to match the environment we perform the checks
// within. (In theory, running checks with different env vars could result in different
// outcomes of the checks.)
if !security::check_env_vars_were_cleared(&worker_info) {
let err = "not all env vars were cleared when spawning the process";
gum::error!(
target: LOG_TARGET,
?worker_info,
"{}",
err
);
if security_status.secure_validator_mode {
worker_shutdown(worker_info, err);
}
}
// Call based on whether we can change root. Error out if it should work but fails.
//
// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
@@ -337,23 +446,10 @@ pub fn run_worker<F>(
}
}
}
if !security::check_env_vars_were_cleared(&worker_info) {
let err = "not all env vars were cleared when spawning the process";
gum::error!(
target: LOG_TARGET,
?worker_info,
"{}",
err
);
if security_status.secure_validator_mode {
worker_shutdown(worker_info, err);
}
}
}
// Run the main worker loop.
let err = event_loop(stream, worker_info.worker_dir_path.clone())
let err = event_loop(stream, &worker_info, security_status)
// It's never `Ok` because it's `Ok(Never)`.
.unwrap_err();
@@ -54,8 +54,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
#[cfg(target_os = "linux")]
pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
let worker_dir_path = tempdir.to_owned();
try_restrict(&WorkerInfo {
pid: std::process::id(),
@@ -69,7 +68,6 @@ pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
#[cfg(target_os = "linux")]
fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
macro_rules! cstr_ptr {
@@ -78,12 +76,6 @@ fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
};
}
gum::trace!(
target: LOG_TARGET,
?worker_info,
"unsharing the user namespace and calling pivot_root",
);
let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
.expect("on unix; the path will never contain 0 bytes; qed");
@@ -0,0 +1,93 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Functionality for securing the job processes spawned by the workers using `clone`. If
//! unsupported, falls back to `fork`.
use crate::{worker::WorkerInfo, LOG_TARGET};
use nix::{
errno::Errno,
sched::{CloneCb, CloneFlags},
unistd::Pid,
};
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("could not clone, errno: {0}")]
Clone(Errno),
}
pub type Result<T> = std::result::Result<T, Error>;
/// Try to run clone(2) on the current worker.
///
/// SAFETY: new process should be either spawned within a single threaded process, or use only
/// async-signal-safe functions.
pub unsafe fn clone_on_worker(
worker_info: &WorkerInfo,
have_unshare_newuser: bool,
cb: CloneCb,
) -> Result<Pid> {
let flags = clone_flags(have_unshare_newuser);
gum::trace!(
target: LOG_TARGET,
?worker_info,
"calling clone with flags: {:?}",
flags
);
try_clone(cb, flags)
}
/// Runs a check for clone(2) with all sandboxing flags and returns an error indicating whether it
/// can be fully enabled on the current Linux environment.
///
/// SAFETY: new process should be either spawned within a single threaded process, or use only
/// async-signal-safe functions.
pub unsafe fn check_can_fully_clone() -> Result<()> {
try_clone(Box::new(|| 0), clone_flags(false)).map(|_pid| ())
}
/// Runs clone(2) with all sandboxing flags.
///
/// SAFETY: new process should be either spawned within a single threaded process, or use only
/// async-signal-safe functions.
unsafe fn try_clone(cb: CloneCb, flags: CloneFlags) -> Result<Pid> {
let mut stack = [0u8; 2 * 1024 * 1024];
nix::sched::clone(cb, stack.as_mut_slice(), flags, None).map_err(|errno| Error::Clone(errno))
}
/// Returns flags for `clone(2)`, including all the sandbox-related ones.
fn clone_flags(have_unshare_newuser: bool) -> CloneFlags {
// NOTE: CLONE_NEWUSER does not work in `clone` if we previously called `unshare` with this
// flag. On the other hand, if we did not call `unshare` we need this flag for the CAP_SYS_ADMIN
// capability.
let maybe_clone_newuser =
if have_unshare_newuser { CloneFlags::empty() } else { CloneFlags::CLONE_NEWUSER };
// SIGCHLD flag is used to inform clone that the parent process is
// expecting a child termination signal, without this flag `waitpid` function
// return `ECHILD` error.
maybe_clone_newuser |
CloneFlags::CLONE_NEWCGROUP |
CloneFlags::CLONE_NEWIPC |
CloneFlags::CLONE_NEWNET |
CloneFlags::CLONE_NEWNS |
CloneFlags::CLONE_NEWPID |
CloneFlags::CLONE_NEWUTS |
CloneFlags::from_bits_retain(libc::SIGCHLD)
}
@@ -112,7 +112,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
/// Runs a check for landlock in its own thread, and returns an error indicating whether the given
/// landlock ABI is fully enabled on the current Linux environment.
pub fn check_is_fully_enabled() -> Result<()> {
pub fn check_can_fully_enable() -> Result<()> {
match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() {
Ok(Ok(())) => Ok(()),
Ok(Err(err)) => Err(err),
@@ -165,7 +165,7 @@ mod tests {
#[test]
fn restricted_thread_cannot_read_file() {
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
if check_is_fully_enabled().is_err() {
if check_can_fully_enable().is_err() {
return
}
@@ -230,7 +230,7 @@ mod tests {
#[test]
fn restricted_thread_cannot_write_file() {
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
if check_is_fully_enabled().is_err() {
if check_can_fully_enable().is_err() {
return
}
@@ -289,7 +289,7 @@ mod tests {
#[test]
fn restricted_thread_can_truncate_file() {
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
if check_is_fully_enabled().is_err() {
if check_can_fully_enable().is_err() {
return
}
@@ -27,15 +27,17 @@
//! - Restrict networking by blocking socket creation and io_uring.
//! - Remove env vars
use crate::{worker::WorkerInfo, LOG_TARGET};
#[cfg(target_os = "linux")]
pub mod change_root;
#[cfg(target_os = "linux")]
pub mod clone;
#[cfg(target_os = "linux")]
pub mod landlock;
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub mod seccomp;
use crate::{worker::WorkerInfo, LOG_TARGET};
/// Require env vars to have been removed when spawning the process, to prevent malicious code from
/// accessing them.
pub fn check_env_vars_were_cleared(worker_info: &WorkerInfo) -> bool {
@@ -110,7 +110,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
/// Runs a check for seccomp in its own thread, and returns an error indicating whether seccomp with
/// our rules is fully enabled on the current Linux environment.
pub fn check_is_fully_enabled() -> Result<()> {
pub fn check_can_fully_enable() -> Result<()> {
match std::thread::spawn(|| try_restrict()).join() {
Ok(Ok(())) => Ok(()),
Ok(Err(err)) => Err(err),
@@ -161,7 +161,7 @@ mod tests {
#[test]
fn sandboxed_thread_cannot_use_sockets() {
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
if check_is_fully_enabled().is_err() {
if check_can_fully_enable().is_err() {
return
}