PVF worker: Add seccomp restrictions (restrict networking) (#2009)

This commit is contained in:
Marcin S
2023-10-31 11:08:08 +01:00
committed by GitHub
parent 2d9426f1cc
commit 9faea380dc
27 changed files with 1376 additions and 714 deletions
+3 -2
View File
@@ -32,10 +32,9 @@ pub use sp_tracing;
const LOG_TARGET: &str = "parachain::pvf-common";
use std::{
io::{Read, Write},
io::{self, Read, Write},
mem,
};
use tokio::io;
#[cfg(feature = "test-utils")]
pub mod tests {
@@ -50,6 +49,8 @@ pub mod tests {
pub struct SecurityStatus {
/// Whether the landlock features we use are fully available on this system.
pub can_enable_landlock: bool,
/// Whether the seccomp features we use are fully available on this system.
pub can_enable_seccomp: bool,
/// Whether we are able to unshare the user namespace and change the filesystem root.
pub can_unshare_user_namespace_and_change_root: bool,
}
+38 -19
View File
@@ -23,13 +23,12 @@ use cpu_time::ProcessTime;
use futures::never::Never;
use std::{
any::Any,
fmt,
fmt, io,
os::unix::net::UnixStream,
path::PathBuf,
sync::mpsc::{Receiver, RecvTimeoutError},
time::Duration,
};
use tokio::{io, runtime::Runtime};
/// Use this macro to declare a `fn main() {}` that will create an executable that can be used for
/// spawning the desired worker.
@@ -85,6 +84,13 @@ macro_rules! decl_worker_main {
let status = -1;
std::process::exit(status)
},
"--check-can-enable-seccomp" => {
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 };
#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
let status = -1;
std::process::exit(status)
},
"--check-can-unshare-user-namespace-and-change-root" => {
#[cfg(target_os = "linux")]
let status = if let Err(err) = security::unshare_user_namespace_and_change_root(
@@ -129,6 +135,7 @@ macro_rules! decl_worker_main {
let mut worker_dir_path = None;
let mut node_version = None;
let mut can_enable_landlock = false;
let mut can_enable_seccomp = false;
let mut can_unshare_user_namespace_and_change_root = false;
let mut i = 2;
@@ -147,6 +154,7 @@ macro_rules! decl_worker_main {
i += 1
},
"--can-enable-landlock" => can_enable_landlock = true,
"--can-enable-seccomp" => can_enable_seccomp = true,
"--can-unshare-user-namespace-and-change-root" =>
can_unshare_user_namespace_and_change_root = true,
arg => panic!("Unexpected argument found: {}", arg),
@@ -161,6 +169,7 @@ macro_rules! decl_worker_main {
let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned();
let security_status = $crate::SecurityStatus {
can_enable_landlock,
can_enable_seccomp,
can_unshare_user_namespace_and_change_root,
};
@@ -198,7 +207,7 @@ impl fmt::Display for WorkerKind {
// The worker version must be passed in so that we accurately get the version of the worker, and not
// the version that this crate was compiled with.
pub fn worker_event_loop<F, Fut>(
pub fn worker_event_loop<F>(
worker_kind: WorkerKind,
socket_path: PathBuf,
#[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf,
@@ -207,8 +216,7 @@ pub fn worker_event_loop<F, Fut>(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus,
mut event_loop: F,
) where
F: FnMut(UnixStream, PathBuf) -> Fut,
Fut: futures::Future<Output = io::Result<Never>>,
F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
{
let worker_pid = std::process::id();
gum::debug!(
@@ -262,7 +270,7 @@ pub fn worker_event_loop<F, Fut>(
}
// Connect to the socket.
let stream = || -> std::io::Result<UnixStream> {
let stream = || -> io::Result<UnixStream> {
let stream = UnixStream::connect(&socket_path)?;
let _ = std::fs::remove_file(&socket_path);
Ok(stream)
@@ -317,6 +325,24 @@ pub fn worker_event_loop<F, Fut>(
let landlock_status =
security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path);
if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) {
// We previously were able to enable, so this should never happen.
gum::error!(
target: LOG_TARGET,
%worker_kind,
%worker_pid,
"could not fully enable landlock: {:?}. This should not happen, please report an issue",
landlock_status
);
}
}
// TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI
// job to catch regressions. See <https://github.com/paritytech/ci_cd/issues/609>.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
if security_status.can_enable_seccomp {
let seccomp_status =
security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path);
if !matches!(seccomp_status, Ok(())) {
// We previously were able to enable, so this should never happen.
//
// TODO: Make this a real error in secure-mode. See:
@@ -325,8 +351,8 @@ pub fn worker_event_loop<F, Fut>(
target: LOG_TARGET,
%worker_kind,
%worker_pid,
"could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs",
landlock_status
"could not fully enable seccomp: {:?}. This should not happen, please report an issue",
seccomp_status
);
}
}
@@ -346,18 +372,11 @@ pub fn worker_event_loop<F, Fut>(
}
// Run the main worker loop.
let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it.");
let err = rt
.block_on(event_loop(stream, worker_dir_path))
let err = event_loop(stream, worker_dir_path)
// It's never `Ok` because it's `Ok(Never)`.
.unwrap_err();
worker_shutdown_message(worker_kind, worker_pid, &err.to_string());
// We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast
// as possible and not wait for stalled validation to finish. This isn't strictly necessary now,
// but may be in the future.
rt.shutdown_background();
}
/// Provide a consistent message on worker shutdown.
@@ -438,7 +457,7 @@ fn kill_parent_node_in_emergency() {
/// The motivation for this module is to coordinate worker threads without using async Rust.
pub mod thread {
use std::{
panic,
io, panic,
sync::{Arc, Condvar, Mutex},
thread,
time::Duration,
@@ -479,7 +498,7 @@ pub mod thread {
f: F,
cond: Cond,
outcome: WaitOutcome,
) -> std::io::Result<thread::JoinHandle<R>>
) -> io::Result<thread::JoinHandle<R>>
where
F: FnOnce() -> R,
F: Send + 'static + panic::UnwindSafe,
@@ -497,7 +516,7 @@ pub mod thread {
cond: Cond,
outcome: WaitOutcome,
stack_size: usize,
) -> std::io::Result<thread::JoinHandle<R>>
) -> io::Result<thread::JoinHandle<R>>
where
F: FnOnce() -> R,
F: Send + 'static + panic::UnwindSafe,
@@ -1,512 +0,0 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Functionality for securing workers.
//!
//! This is needed because workers are used to compile and execute untrusted code (PVFs).
//!
//! We currently employ the following security measures:
//!
//! - Restrict filesystem
//! - Use Landlock to remove all unnecessary FS access rights.
//! - Unshare the user and mount namespaces.
//! - Change the root directory to a worker-specific temporary directory.
//! - Remove env vars
use crate::{worker::WorkerKind, LOG_TARGET};
/// Unshare the user namespace and change root to be the artifact directory.
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
///
/// # Errors
///
/// Returns a string naming the step that failed together with the last OS error. For the checks
/// performed after `pivot_root`, returns either a description of the expectation that did not
/// hold or the stringified I/O error raised while checking it.
#[cfg(target_os = "linux")]
pub fn unshare_user_namespace_and_change_root(
worker_kind: WorkerKind,
worker_pid: u32,
worker_dir_path: &std::path::Path,
) -> Result<(), String> {
use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr};
// The following was copied from the `cstr_core` crate.
//
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
#[inline]
#[doc(hidden)]
const fn cstr_is_valid(bytes: &[u8]) -> bool {
// A valid C string is non-empty and ends with a nul byte...
if bytes.is_empty() || bytes[bytes.len() - 1] != 0 {
return false
}
// ...and contains no interior nul bytes.
let mut index = 0;
while index < bytes.len() - 1 {
if bytes[index] == 0 {
return false
}
index += 1;
}
true
}
// Builds a `&'static CStr` from a string literal, validated at compile time.
macro_rules! cstr {
($e:expr) => {{
const STR: &[u8] = concat!($e, "\0").as_bytes();
const STR_VALID: bool = cstr_is_valid(STR);
// Compile-time assertion: when `STR_VALID` is false the array length `0 - 1` underflows
// during constant evaluation and the build fails.
let _ = [(); 0 - (!(STR_VALID) as usize)];
#[allow(unused_unsafe)]
unsafe {
core::ffi::CStr::from_bytes_with_nul_unchecked(STR)
}
}}
}
gum::debug!(
target: LOG_TARGET,
%worker_kind,
%worker_pid,
?worker_dir_path,
"unsharing the user namespace and calling pivot_root",
);
let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes())
.expect("on unix; the path will never contain 0 bytes; qed");
// Wrapper around all the work to prevent repetitive error handling.
//
// # Errors
//
// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
// not give the context of which call failed, so we return a &str error.
|| -> Result<(), &'static str> {
// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
// (2) and (3) are adapted from the example in pivot_root(2), with the additional
// change described in the `pivot_root(".", ".")` section.
unsafe {
// 1. `unshare` the user and the mount namespaces.
if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
return Err("unshare user and mount namespaces")
}
// 2. Setup mounts.
//
// Ensure that new root and its parent mount don't have shared propagation (which would
// cause pivot_root() to return an error), and prevent propagation of mount events to
// the initial mount namespace.
if libc::mount(
ptr::null(),
cstr!("/").as_ptr(),
ptr::null(),
libc::MS_REC | libc::MS_PRIVATE,
ptr::null(),
) < 0
{
return Err("mount MS_PRIVATE")
}
// Ensure that the new root is a mount point.
//
// `Execute` and `CheckPivotRoot` workers additionally get the new root mounted read-only.
let additional_flags =
if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind {
libc::MS_RDONLY
} else {
0
};
if libc::mount(
worker_dir_path_c.as_ptr(),
worker_dir_path_c.as_ptr(),
ptr::null(), // ignored when MS_BIND is used
libc::MS_BIND |
libc::MS_REC | libc::MS_NOEXEC |
libc::MS_NODEV | libc::MS_NOSUID |
libc::MS_NOATIME | additional_flags,
ptr::null(), // ignored when MS_BIND is used
) < 0
{
return Err("mount MS_BIND")
}
// 3. `pivot_root` to the artifact directory.
if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
return Err("chdir to worker dir path")
}
if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 {
return Err("pivot_root")
}
if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 {
return Err("umount the old root mount point")
}
}
Ok(())
}()
// Attach the OS error of the failed call to the name of the step that failed.
.map_err(|err_ctx| {
let err = std::io::Error::last_os_error();
format!("{}: {}", err_ctx, err)
})?;
// Do some assertions.
if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
return Err("expected current dir after pivot_root to be `/`".into())
}
// Try to step out of the new root; the current directory must still be `/` afterwards.
env::set_current_dir("..").map_err(|err| err.to_string())?;
if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
return Err("expected not to be able to break out of new root by doing `..`".into())
}
Ok(())
}
/// Checks that the environment this worker was spawned with has been scrubbed, so that malicious
/// code cannot read env vars.
///
/// Every unexpected variable is reported through an error log. Returns `true` only if no
/// unexpected variable was found.
pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool {
    let mut all_cleared = true;

    for (key, value) in std::env::vars_os() {
        // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
        // randomness for malicious code. In the future we can remove it also and log in the host;
        // see <https://github.com/paritytech/polkadot/issues/7117>.
        let tolerated = key == "RUST_LOG";

        // An exception for MacOS. This is not a secure platform anyway, so we let it slide.
        #[cfg(target_os = "macos")]
        let tolerated = tolerated || key == "__CF_USER_TEXT_ENCODING";

        if !tolerated {
            gum::error!(
                target: LOG_TARGET,
                %worker_kind,
                %worker_pid,
                ?key,
                ?value,
                "env var was present that should have been removed",
            );
            all_cleared = false;
        }
    }

    all_cleared
}
/// The [landlock] docs say it best:
///
/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
/// sandboxes as new security layers in addition to the existing system-wide access-controls. This
/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
/// to securely restrict themselves."
///
/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html
#[cfg(target_os = "linux")]
pub mod landlock {
pub use landlock::RulesetStatus;
use crate::{worker::WorkerKind, LOG_TARGET};
use landlock::*;
use std::{
fmt,
path::{Path, PathBuf},
};
/// Landlock ABI version. We use ABI V1 because:
///
/// 1. It is supported by our reference kernel version.
/// 2. Later versions do not (yet) provide additional security that would benefit us.
///
/// # Versions (as of October 2023)
///
/// - Polkadot reference kernel version: 5.16+
///
/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
///
/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
/// execution an attacker can only affect the name of a symlinked artifact and not the
/// original one.
///
/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
/// plan to check for file integrity anyway; see
/// <https://github.com/paritytech/polkadot-sdk/issues/677>.
///
/// # Determinism
///
/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
/// just fall back to V1 if not.
///
/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1,
/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack
/// vector: it can exploit this indeterminacy between landlock ABIs!
///
/// On the other hand we do want validators to be as secure as possible and protect their keys
/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
/// supports it or if it introduces some new feature that is beneficial to security.
pub const LANDLOCK_ABI: ABI = ABI::V1;
/// Errors that can occur while trying to apply landlock restrictions.
#[derive(Debug)]
pub enum TryRestrictError {
    /// No landlock rule could be produced for the given exception path.
    InvalidExceptionPath(PathBuf),
    /// An error reported by the landlock ruleset API.
    RulesetError(RulesetError),
}

impl From<RulesetError> for TryRestrictError {
    /// Wraps a landlock `RulesetError`, which lets `?` be used on ruleset operations.
    fn from(err: RulesetError) -> Self {
        Self::RulesetError(err)
    }
}

impl fmt::Display for TryRestrictError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path),
            // The error is formatted through its `Display` impl directly; going through
            // `to_string()` first would only allocate an intermediate `String`.
            Self::RulesetError(err) => write!(f, "ruleset error: {}", err),
        }
    }
}

impl std::error::Error for TryRestrictError {}
/// Applies landlock to the current worker, leaving only the access its kind needs.
///
/// A prepare worker keeps write access to `worker_dir_path`; an execute worker keeps read
/// access to it.
///
/// # Panics
///
/// Panics when called with `WorkerKind::CheckPivotRoot`.
pub fn enable_for_worker(
    worker_kind: WorkerKind,
    worker_pid: u32,
    worker_dir_path: &Path,
) -> Result<RulesetStatus, Box<dyn std::error::Error>> {
    let allowed_access: BitFlags<AccessFs> = match worker_kind {
        WorkerKind::Prepare => AccessFs::WriteFile.into(),
        WorkerKind::Execute => AccessFs::ReadFile.into(),
        WorkerKind::CheckPivotRoot =>
            panic!("this should only be passed for checking pivot_root; qed"),
    };
    let fs_exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> =
        vec![(worker_dir_path.to_path_buf(), allowed_access)];

    gum::debug!(
        target: LOG_TARGET,
        %worker_kind,
        %worker_pid,
        ?worker_dir_path,
        "enabling landlock with exceptions: {:?}",
        fs_exceptions,
    );

    try_restrict(fs_exceptions).map_err(|err| err.into())
}
// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
/// Runs a check for landlock and returns a single bool indicating whether the given landlock
/// ABI is fully enabled on the current Linux environment.
pub fn check_is_fully_enabled() -> bool {
let status_from_thread: Result<RulesetStatus, Box<dyn std::error::Error>> =
match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()))
.join()
{
Ok(Ok(status)) => Ok(status),
Ok(Err(ruleset_err)) => Err(ruleset_err.into()),
Err(_err) => Err("a panic occurred in try_restrict".into()),
};
matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced))
}
/// Restricts the calling thread with landlock (intended for a process' main thread).
///
/// Access controls applied:
///
/// 1. all global filesystem access restricted, except for the given `fs_exceptions`
/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
///
/// If landlock is not supported in the current environment this is simply a noop.
///
/// # Errors
///
/// Fails with `InvalidExceptionPath` when no rule can be produced for an exception path, and
/// with `RulesetError` when a landlock ruleset operation fails.
///
/// # Returns
///
/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<RulesetStatus, TryRestrictError>
where
    I: IntoIterator<Item = (P, A)>,
    P: AsRef<Path>,
    A: Into<BitFlags<AccessFs>>,
{
    let handled_access = AccessFs::from_all(LANDLOCK_ABI);
    let mut restrictions = Ruleset::default().handle_access(handled_access)?.create()?;

    for (exception_path, allowed_access) in fs_exceptions {
        let exception_path = exception_path.as_ref();
        let mut path_rules =
            path_beneath_rules(std::iter::once(exception_path), allowed_access).peekable();

        // `path_beneath_rules` silently ignores missing paths, so check for it manually.
        if path_rules.peek().is_none() {
            return Err(TryRestrictError::InvalidExceptionPath(exception_path.to_owned()))
        }

        restrictions = restrictions.add_rules(path_rules)?;
    }

    Ok(restrictions.restrict_self()?.ruleset)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{fs, io::ErrorKind, thread};

    /// Contents written to every temporary file.
    const TEXT: &str = "foo";

    /// Panics unless `status` reports a fully enforced ruleset.
    fn assert_fully_enforced(status: Result<RulesetStatus, TryRestrictError>) {
        if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
            panic!(
                "Ruleset should be enforced since we checked if landlock is enabled: {:?}",
                status
            );
        }
    }

    /// Tells whether an I/O operation was rejected with `PermissionDenied`.
    fn is_permission_denied<T>(result: std::io::Result<T>) -> bool {
        matches!(result, Err(err) if err.kind() == ErrorKind::PermissionDenied)
    }

    #[test]
    fn restricted_thread_cannot_read_file() {
        // TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
        if !check_is_fully_enabled() {
            return
        }

        // Restricted thread cannot read from FS.
        let restricted = thread::spawn(|| {
            // Before any restriction is applied, both tmp files can be written and read back.
            let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
            let path1 = tmpfile1.path();
            let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
            let path2 = tmpfile2.path();

            fs::write(path1, TEXT).unwrap();
            assert_eq!(fs::read_to_string(path1).unwrap(), TEXT);
            fs::write(path2, TEXT).unwrap();
            assert_eq!(fs::read_to_string(path2).unwrap(), TEXT);

            // Apply Landlock with a read exception for only one of the files.
            assert_fully_enforced(try_restrict(vec![(path1, AccessFs::ReadFile)]));

            // Only the file covered by the exception is still readable.
            assert!(matches!(fs::read_to_string(path1), Ok(s) if s == TEXT));
            assert!(is_permission_denied(fs::read_to_string(path2)));

            // Apply Landlock for all files.
            assert_fully_enforced(try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()));

            // With no exception left, the first file is unreadable as well.
            assert!(is_permission_denied(fs::read_to_string(path1)));
        });

        assert!(restricted.join().is_ok());
    }

    #[test]
    fn restricted_thread_cannot_write_file() {
        // TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
        if !check_is_fully_enabled() {
            return
        }

        // Restricted thread cannot write to FS.
        let restricted = thread::spawn(|| {
            // Before any restriction is applied, both tmp files can be written.
            let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
            let path1 = tmpfile1.path();
            let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
            let path2 = tmpfile2.path();

            fs::write(path1, TEXT).unwrap();
            fs::write(path2, TEXT).unwrap();

            // Apply Landlock with a write exception for only one of the files.
            assert_fully_enforced(try_restrict(vec![(path1, AccessFs::WriteFile)]));

            // Only the file covered by the exception is still writable.
            assert!(matches!(fs::write(path1, TEXT), Ok(_)));
            assert!(is_permission_denied(fs::write(path2, TEXT)));

            // Apply Landlock for all files.
            assert_fully_enforced(try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()));

            // With no exception left, the first file is unwritable as well.
            assert!(is_permission_denied(fs::write(path1, TEXT)));
        });

        assert!(restricted.join().is_ok());
    }

    // Test that checks whether landlock under our ABI version is able to truncate files.
    #[test]
    fn restricted_thread_can_truncate_file() {
        // TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
        if !check_is_fully_enabled() {
            return
        }

        // Restricted thread can truncate file.
        let restricted = thread::spawn(|| {
            // Before any restriction is applied, the tmp file can be written.
            let tmpfile = tempfile::NamedTempFile::new().unwrap();
            let path = tmpfile.path();
            fs::write(path, TEXT).unwrap();

            // Apply Landlock with all exceptions under the current ABI.
            assert_fully_enforced(try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]));

            // Try to truncate the file.
            assert!(tmpfile.as_file().set_len(0).is_ok());
        });

        assert!(restricted.join().is_ok());
    }
}
}
@@ -0,0 +1,325 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! The [landlock] docs say it best:
//!
//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict
//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security
//! sandboxes as new security layers in addition to the existing system-wide access-controls. This
//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or
//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones,
//! to securely restrict themselves."
//!
//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html
pub use landlock::RulesetStatus;
use crate::{
worker::{stringify_panic_payload, WorkerKind},
LOG_TARGET,
};
use landlock::*;
use std::path::{Path, PathBuf};
/// Landlock ABI version. We use ABI V1 because:
///
/// 1. It is supported by our reference kernel version.
/// 2. Later versions do not (yet) provide additional security that would benefit us.
///
/// # Versions (as of October 2023)
///
/// - Polkadot reference kernel version: 5.16+
///
/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads.
///
/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During
/// execution an attacker can only affect the name of a symlinked artifact and not the original
/// one.
///
/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can
/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we
/// plan to check for file integrity anyway; see
/// <https://github.com/paritytech/polkadot-sdk/issues/677>.
///
/// # Determinism
///
/// You may wonder whether we could always use the latest ABI instead of only the ABI supported
/// by the reference kernel version. It seems plausible, since landlock provides a best-effort
/// approach to enabling sandboxing. For example, if the reference version only supported V1 and
/// we were on V2, then landlock would use V2 if it was supported on the current machine, and
/// just fall back to V1 if not.
///
/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1,
/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack
/// vector: it can exploit this indeterminacy between landlock ABIs!
///
/// On the other hand we do want validators to be as secure as possible and protect their keys
/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy
/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version
/// supports it or if it introduces some new feature that is beneficial to security.
pub const LANDLOCK_ABI: ABI = ABI::V1;
/// Errors that can occur while applying or probing landlock restrictions.
#[derive(thiserror::Error, Debug)]
pub enum Error {
/// No landlock rule could be produced for the given exception path.
#[error("Invalid exception path: {0:?}")]
InvalidExceptionPath(PathBuf),
/// An error reported by the landlock ruleset API.
#[error(transparent)]
RulesetError(#[from] RulesetError),
/// The thread running `try_restrict` panicked; holds the stringified panic payload.
#[error("A panic occurred in try_restrict: {0}")]
Panic(String),
}
/// Result type whose error variant is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
/// Applies landlock to the current worker, leaving only the access its kind needs.
///
/// A prepare worker keeps write access to `worker_dir_path`; an execute worker keeps read
/// access to it.
///
/// # Panics
///
/// Panics when called with `WorkerKind::CheckPivotRoot`.
pub fn enable_for_worker(
    worker_kind: WorkerKind,
    worker_pid: u32,
    worker_dir_path: &Path,
) -> Result<RulesetStatus> {
    let allowed_access: BitFlags<AccessFs> = match worker_kind {
        WorkerKind::Prepare => AccessFs::WriteFile.into(),
        WorkerKind::Execute => AccessFs::ReadFile.into(),
        WorkerKind::CheckPivotRoot =>
            panic!("this should only be passed for checking pivot_root; qed"),
    };
    let fs_exceptions: Vec<(PathBuf, BitFlags<AccessFs>)> =
        vec![(worker_dir_path.to_path_buf(), allowed_access)];

    gum::trace!(
        target: LOG_TARGET,
        %worker_kind,
        %worker_pid,
        ?worker_dir_path,
        "enabling landlock with exceptions: {:?}",
        fs_exceptions,
    );

    try_restrict(fs_exceptions)
}
// TODO: <https://github.com/landlock-lsm/rust-landlock/issues/36>
/// Runs a check for landlock and returns a single bool indicating whether the given landlock
/// ABI is fully enabled on the current Linux environment.
pub fn check_is_fully_enabled() -> bool {
let status_from_thread: Result<RulesetStatus> =
match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join()
{
Ok(Ok(status)) => Ok(status),
Ok(Err(ruleset_err)) => Err(ruleset_err.into()),
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
};
matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced))
}
/// Tries to restrict the current thread (should only be called in a process' main thread) with
/// the following landlock access controls:
///
/// 1. all global filesystem access restricted, with optional exceptions
/// 2. ... more sandbox types (e.g. networking) may be supported in the future.
///
/// If landlock is not supported in the current environment this is simply a noop.
///
/// # Returns
///
/// The status of the restriction (whether it was fully, partially, or not-at-all enforced).
fn try_restrict<I, P, A>(fs_exceptions: I) -> Result<RulesetStatus>
where
I: IntoIterator<Item = (P, A)>,
P: AsRef<Path>,
A: Into<BitFlags<AccessFs>>,
{
let mut ruleset =
Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?;
for (fs_path, access_bits) in fs_exceptions {
let paths = &[fs_path.as_ref().to_owned()];
let mut rules = path_beneath_rules(paths, access_bits).peekable();
if rules.peek().is_none() {
// `path_beneath_rules` silently ignores missing paths, so check for it manually.
return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned()))
}
ruleset = ruleset.add_rules(rules)?;
}
let status = ruleset.restrict_self()?;
Ok(status.ruleset)
}
#[cfg(test)]
mod tests {
use super::*;
use std::{fs, io::ErrorKind, thread};
#[test]
fn restricted_thread_cannot_read_file() {
// TODO: This would be nice: <https://github.com/rust-lang/rust/issues/68007>.
if !check_is_fully_enabled() {
return
}
// Restricted thread cannot read from FS.
let handle = thread::spawn(|| {
// Create, write, and read two tmp files. This should succeed before any
// landlock restrictions are applied.
const TEXT: &str = "foo";
let tmpfile1 = tempfile::NamedTempFile::new().unwrap();
let path1 = tmpfile1.path();
let tmpfile2 = tempfile::NamedTempFile::new().unwrap();
let path2 = tmpfile2.path();
fs::write(path1, TEXT).unwrap();
let s = fs::read_to_string(path1).unwrap();
assert_eq!(s, TEXT);
fs::write(path2, TEXT).unwrap();
let s = fs::read_to_string(path2).unwrap();
assert_eq!(s, TEXT);
// Apply Landlock with a read exception for only one of the files.
let status = try_restrict(vec![(path1, AccessFs::ReadFile)]);
if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
panic!(
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
status
);
}
// Try to read from both files, only tmpfile1 should succeed.
let result = fs::read_to_string(path1);
assert!(matches!(
result,
Ok(s) if s == TEXT
));
let result = fs::read_to_string(path2);
assert!(matches!(
result,
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
));
// Apply Landlock for all files.
let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
if !matches!(status, Ok(RulesetStatus::FullyEnforced)) {
panic!(
"Ruleset should be enforced since we checked if landlock is enabled: {:?}",
status
);
}
// Try to read from tmpfile1 after landlock, it should fail.
let result = fs::read_to_string(path1);
assert!(matches!(
result,
Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied)
));
});
assert!(handle.join().is_ok());
}
#[test]
fn restricted_thread_cannot_write_file() {
    // Bail out quietly when Landlock is not fully available. Reporting the test as ignored
    // at runtime would be nicer: <https://github.com/rust-lang/rust/issues/68007>.
    if !check_is_fully_enabled() {
        return
    }

    // Run the whole scenario on a dedicated thread and only look at whether it panicked.
    let worker = thread::spawn(|| {
        const TEXT: &str = "foo";

        // Both temp files are created and written before any restriction is applied, so
        // these writes have to succeed.
        let allowed_file = tempfile::NamedTempFile::new().unwrap();
        let denied_file = tempfile::NamedTempFile::new().unwrap();
        let allowed = allowed_file.path();
        let denied = denied_file.path();
        fs::write(allowed, TEXT).unwrap();
        fs::write(denied, TEXT).unwrap();

        // First restriction: the only write exception is for `allowed`.
        let status = try_restrict(vec![(allowed, AccessFs::WriteFile)]);
        assert!(
            matches!(status, Ok(RulesetStatus::FullyEnforced)),
            "Ruleset should be enforced since we checked if landlock is enabled: {:?}",
            status
        );

        // Writing to `allowed` still works, writing to `denied` is rejected.
        assert!(fs::write(allowed, TEXT).is_ok());
        let denied_write = fs::write(denied, TEXT);
        assert!(matches!(
            denied_write.map_err(|err| err.kind()),
            Err(ErrorKind::PermissionDenied)
        ));

        // Second restriction: no exceptions at all.
        let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>());
        assert!(
            matches!(status, Ok(RulesetStatus::FullyEnforced)),
            "Ruleset should be enforced since we checked if landlock is enabled: {:?}",
            status
        );

        // Now even the previously allowed file must be rejected.
        let late_write = fs::write(allowed, TEXT);
        assert!(matches!(
            late_write.map_err(|err| err.kind()),
            Err(ErrorKind::PermissionDenied)
        ));
    });

    assert!(worker.join().is_ok());
}
// Checks that, under the Landlock ABI version we use, a restricted thread can still truncate
// a file for which it was given every access right of that ABI.
#[test]
fn restricted_thread_can_truncate_file() {
    // Bail out quietly when Landlock is not fully available. Reporting the test as ignored
    // at runtime would be nicer: <https://github.com/rust-lang/rust/issues/68007>.
    if !check_is_fully_enabled() {
        return
    }

    // Run the whole scenario on a dedicated thread and only look at whether it panicked.
    let worker = thread::spawn(|| {
        const TEXT: &str = "foo";

        // The file is created and written before any restriction is applied, so this has to
        // succeed.
        let tmpfile = tempfile::NamedTempFile::new().unwrap();
        let path = tmpfile.path();
        fs::write(path, TEXT).unwrap();

        // Restrict, with an exception for `path` covering all rights of the current ABI.
        let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]);
        assert!(
            matches!(status, Ok(RulesetStatus::FullyEnforced)),
            "Ruleset should be enforced since we checked if landlock is enabled: {:?}",
            status
        );

        // Truncating through the already-open handle must still be possible.
        let truncated = tmpfile.as_file().set_len(0);
        assert!(truncated.is_ok());
    });

    assert!(worker.join().is_ok());
}
}
@@ -0,0 +1,189 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Functionality for securing workers.
//!
//! This is needed because workers are used to compile and execute untrusted code (PVFs).
//!
//! We currently employ the following security measures:
//!
//! - Restrict filesystem
//!   - Use Landlock to remove all unnecessary FS access rights.
//!   - Unshare the user and mount namespaces.
//!   - Change the root directory to a worker-specific temporary directory.
//! - Restrict networking by blocking socket creation and `io_uring`.
//! - Remove env vars
use crate::{worker::WorkerKind, LOG_TARGET};
#[cfg(target_os = "linux")]
pub mod landlock;
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub mod seccomp;
/// Unshare the user namespace and change root to be the worker directory.
///
/// The steps are, in order: unshare the user and mount namespaces, mark the `/` mount tree as
/// private, bind-mount `worker_dir_path` onto itself (with `noexec`, `nodev`, `nosuid` and
/// `noatime`, and additionally read-only for `WorkerKind::Execute` and
/// `WorkerKind::CheckPivotRoot`), `pivot_root` into it, and detach the old root.
///
/// `worker_kind` selects the read-only flag and is logged; `worker_pid` is only logged.
///
/// # Errors
///
/// Returns a description of the step that failed together with the last OS error, or a message
/// describing which of the sanity checks done after `pivot_root` failed.
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
#[cfg(target_os = "linux")]
pub fn unshare_user_namespace_and_change_root(
worker_kind: WorkerKind,
worker_pid: u32,
worker_dir_path: &std::path::Path,
) -> Result<(), String> {
use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr};
// Turns a string literal into a pointer to a null-terminated C string.
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
macro_rules! cstr_ptr {
($e:expr) => {
concat!($e, "\0").as_ptr().cast::<core::ffi::c_char>()
};
}
gum::trace!(
target: LOG_TARGET,
%worker_kind,
%worker_pid,
?worker_dir_path,
"unsharing the user namespace and calling pivot_root",
);
// Null-terminated copy of the worker dir path for the libc calls below.
let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes())
.expect("on unix; the path will never contain 0 bytes; qed");
// Wrapper around all the work to prevent repetitive error handling.
//
// # Errors
//
// It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does
// not give the context of which call failed, so we return a &str error.
|| -> Result<(), &'static str> {
// SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps
// (2) and (3) are adapted from the example in pivot_root(2), with the additional
// change described in the `pivot_root(".", ".")` section.
unsafe {
// 1. `unshare` the user and the mount namespaces.
if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 {
return Err("unshare user and mount namespaces")
}
// 2. Setup mounts.
//
// Ensure that new root and its parent mount don't have shared propagation (which would
// cause pivot_root() to return an error), and prevent propagation of mount events to
// the initial mount namespace.
if libc::mount(
ptr::null(),
cstr_ptr!("/"),
ptr::null(),
libc::MS_REC | libc::MS_PRIVATE,
ptr::null(),
) < 0
{
return Err("mount MS_PRIVATE")
}
// Ensure that the new root is a mount point.
//
// The bind mount is additionally made read-only for the `Execute` and `CheckPivotRoot`
// worker kinds; all other kinds get no extra flag.
let additional_flags =
if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind {
libc::MS_RDONLY
} else {
0
};
if libc::mount(
worker_dir_path_c.as_ptr(),
worker_dir_path_c.as_ptr(),
ptr::null(), // ignored when MS_BIND is used
libc::MS_BIND |
libc::MS_REC | libc::MS_NOEXEC |
libc::MS_NODEV | libc::MS_NOSUID |
libc::MS_NOATIME | additional_flags,
ptr::null(), // ignored when MS_BIND is used
) < 0
{
return Err("mount MS_BIND")
}
// 3. `pivot_root` to the worker directory.
if libc::chdir(worker_dir_path_c.as_ptr()) < 0 {
return Err("chdir to worker dir path")
}
if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 {
return Err("pivot_root")
}
if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 {
return Err("umount the old root mount point")
}
}
Ok(())
}()
// Combine the name of the failed step with the OS error of the failing call.
.map_err(|err_ctx| {
let err = std::io::Error::last_os_error();
format!("{}: {}", err_ctx, err)
})?;
// Do some assertions.
if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
return Err("expected current dir after pivot_root to be `/`".into())
}
// Going up from the new root must leave us at `/`; anything else would mean we escaped it.
env::set_current_dir("..").map_err(|err| err.to_string())?;
if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") {
return Err("expected not to be able to break out of new root by doing `..`".into())
}
Ok(())
}
/// Require env vars to have been removed when spawning the process, to prevent malicious code from
/// accessing them.
///
/// This function only checks the environment; it does not remove anything. Every unexpected
/// variable is logged as an error together with its value. `RUST_LOG` is always tolerated, and
/// on macOS so is `__CF_USER_TEXT_ENCODING`.
///
/// `worker_kind` and `worker_pid` are only used to annotate the log output.
///
/// Returns `true` if no unexpected variable was found, `false` otherwise.
pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool {
    gum::trace!(
        target: LOG_TARGET,
        %worker_kind,
        %worker_pid,
        "checking that env vars were cleared in worker",
    );

    let mut ok = true;

    // Keep going after the first offender so that every leftover variable gets reported.
    for (key, value) in std::env::vars_os() {
        // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of
        // randomness for malicious code. In the future we can remove it also and log in the host;
        // see <https://github.com/paritytech/polkadot/issues/7117>.
        if key == "RUST_LOG" {
            continue
        }

        // An exception for MacOS. This is not a secure platform anyway, so we let it slide.
        #[cfg(target_os = "macos")]
        if key == "__CF_USER_TEXT_ENCODING" {
            continue
        }

        gum::error!(
            target: LOG_TARGET,
            %worker_kind,
            %worker_pid,
            ?key,
            ?value,
            "env var was present that should have been removed",
        );

        ok = false;
    }

    ok
}
@@ -0,0 +1,201 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Functionality for sandboxing workers by restricting their capabilities by blocking certain
//! syscalls with seccomp.
//!
//! For security we block the following:
//!
//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without
//! affecting consensus.
//!
//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the
//! safety of doing this.
//!
//! # Safety of blocking io_uring
//!
//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing
//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally,
//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs
//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these
//! reasons.
//!
//! Considering `io_uring`'s status discussed above, and that it very likely would get detected
//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by
//! testing, we think it is safe to block it.
//!
//! ## Consensus analysis
//!
//! If execution hits an edge case code path unique to a given machine, it's already taken a
//! non-deterministic branch anyway. After all, we just care that the majority of validators reach
//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can
//! always admit fault and refund the wrong validator. On the other hand, if all validators take the
//! code path that results in a seccomp violation, then they would all vote against the current
//! candidate, which is also fine. The violation would get logged (in big scary letters) and
//! hopefully some validator reports it to us.
//!
//! Actually, an even worse scenario is that 50% of validators vote against, so that there is
//! no consensus. But so many things would have to go wrong for that to happen:
//!
//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for
//! IO-heavy applications)
//!
//! 2. The new syscall is not detected by our static analysis
//!
//! 3. It is never triggered in any of our tests
//!
//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a
//! stall (bad but very unlikely)
//!
//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad)
//!
//! Considering how many things would have to go wrong here, we believe it's safe to block
//! `io_uring`.
//!
//! # Action on syscall violations
//!
//! On syscall violations we currently only log, to make sure this works correctly before enforcing.
//!
//! In the future, when a forbidden syscall is attempted we will immediately kill the process in
//! order to prevent the attacker from doing anything else. In execution, this will result in
//! voting against the candidate.
use crate::{
worker::{stringify_panic_payload, WorkerKind},
LOG_TARGET,
};
use seccompiler::*;
use std::{collections::BTreeMap, path::Path};
/// The action to take on caught syscalls.
///
/// Outside of tests a caught syscall is only logged for now; see the module documentation.
#[cfg(not(test))]
const CAUGHT_ACTION: SeccompAction = SeccompAction::Log;
/// The action to take on caught syscalls when testing.
///
/// Caught syscalls fail with `EACCES`, which the tests check for as a permission-denied error.
#[cfg(test)]
const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32);
/// Errors that can occur while building or applying the seccomp filter.
#[derive(thiserror::Error, Debug)]
pub enum Error {
/// An error reported by `seccompiler`, displayed as-is.
#[error(transparent)]
Seccomp(#[from] seccompiler::Error),
/// An error reported by the `seccompiler` backend, displayed as-is.
#[error(transparent)]
Backend(#[from] seccompiler::BackendError),
/// The thread running `try_restrict` panicked; carries the stringified panic payload.
#[error("A panic occurred in try_restrict: {0}")]
Panic(String),
}
/// Result type used throughout this module, with [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
/// Try to enable seccomp for the given kind of worker.
///
/// `worker_kind`, `worker_pid` and `worker_dir_path` are only used for the trace log entry; the
/// filter itself is installed by [`try_restrict`], which takes no arguments.
///
/// # Errors
///
/// Returns the error from [`try_restrict`] unchanged.
pub fn enable_for_worker(
worker_kind: WorkerKind,
worker_pid: u32,
worker_dir_path: &Path,
) -> Result<()> {
gum::trace!(
target: LOG_TARGET,
%worker_kind,
%worker_pid,
?worker_dir_path,
"enabling seccomp",
);
try_restrict()
}
/// Runs a check for seccomp and returns a single bool indicating whether seccomp with our rules is
/// fully enabled on the current Linux environment.
pub fn check_is_fully_enabled() -> bool {
let status_from_thread: Result<()> = match std::thread::spawn(|| try_restrict()).join() {
Ok(Ok(())) => Ok(()),
Ok(Err(err)) => Err(err.into()),
Err(err) => Err(Error::Panic(stringify_panic_payload(err))),
};
matches!(status_from_thread, Ok(()))
}
/// Applies a `seccomp` filter to disable networking for the PVF threads.
///
/// The filter allows every syscall except the ones listed below; for those, `CAUGHT_ACTION` is
/// taken. It is applied to the calling thread.
///
/// # Errors
///
/// Returns an error if the filter cannot be built, compiled to a BPF program, or applied.
pub fn try_restrict() -> Result<()> {
    // The syscalls we want to catch. Everything not listed here stays allowed.
    let caught_syscalls = [
        // Restrict the creation of sockets.
        libc::SYS_socketpair,
        libc::SYS_socket,
        // Prevent connecting to sockets for extra safety.
        libc::SYS_connect,
        // Restrict io_uring.
        libc::SYS_io_uring_setup,
        libc::SYS_io_uring_enter,
        libc::SYS_io_uring_register,
    ];

    // Each syscall is registered with an empty rule list, i.e. without argument conditions.
    let mut rules = BTreeMap::default();
    for syscall in caught_syscalls {
        rules.insert(syscall, vec![]);
    }

    let bpf_prog: BpfProgram = SeccompFilter::new(
        rules,
        // Mismatch action: what to do if not in rule list.
        SeccompAction::Allow,
        // Match action: what to do if in rule list.
        CAUGHT_ACTION,
        TargetArch::x86_64,
    )?
    .try_into()?;

    // Applies filter (runs seccomp) to the calling thread.
    seccompiler::apply_filter(&bpf_prog)?;

    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{io::ErrorKind, net::TcpListener, thread};

    #[test]
    fn sandboxed_thread_cannot_use_sockets() {
        // Bail out quietly when seccomp is not fully available. Reporting the test as ignored
        // at runtime would be nicer: <https://github.com/rust-lang/rust/issues/68007>.
        if !check_is_fully_enabled() {
            return
        }

        // Run the scenario on a dedicated thread and only look at whether it panicked.
        let sandboxed = thread::spawn(|| {
            // Before seccomp is applied, opening a socket has to work.
            TcpListener::bind("127.0.0.1:0").unwrap();

            assert!(
                try_restrict().is_ok(),
                "Ruleset should be enforced since we checked if seccomp is enabled"
            );

            // With the filter in place, the same call has to be rejected.
            let bind_result = TcpListener::bind("127.0.0.1:0");
            assert!(matches!(
                bind_result.map_err(|err| err.kind()),
                Err(ErrorKind::PermissionDenied)
            ));

            // Other syscalls should still work.
            let parent_pid = unsafe { libc::getppid() };
            assert!(parent_pid > 0);
        });

        assert!(sandboxed.join().is_ok());
    }
}