PVF worker: Add seccomp restrictions (restrict networking) (#2009)

This commit is contained in:
Marcin S
2023-10-31 11:08:08 +01:00
committed by GitHub
parent 2d9426f1cc
commit 9faea380dc
27 changed files with 1376 additions and 714 deletions
@@ -18,6 +18,7 @@
use crate::{
artifacts::ArtifactPathId,
security,
worker_intf::{
clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker,
SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
@@ -106,7 +107,7 @@ pub enum Outcome {
/// returns the outcome.
///
/// NOTE: Not returning the idle worker token in `Outcome` will trigger the child process being
/// killed.
/// killed, if it's still alive.
pub async fn start_work(
worker: IdleWorker,
artifact: ArtifactPathId,
@@ -124,7 +125,10 @@ pub async fn start_work(
artifact.path.display(),
);
let artifact_path = artifact.path.clone();
with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move {
let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await;
if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await {
gum::warn!(
target: LOG_TARGET,
@@ -153,9 +157,38 @@ pub async fn start_work(
?error,
"failed to recv an execute response",
);
// The worker died. Check if it was due to a seccomp violation.
//
// NOTE: Log, but don't change the outcome. Not all validators may have
// auditing enabled, so we don't want attackers to abuse a non-deterministic
// outcome.
for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
gum::error!(
target: LOG_TARGET,
worker_pid = %pid,
%syscall,
validation_code_hash = ?artifact.id.code_hash,
?artifact_path,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
return Outcome::IoErr
},
Ok(response) => {
// Check if any syscall violations occurred during the job. For now this is
// only informative, as we are not enforcing the seccomp policy yet.
for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
gum::error!(
target: LOG_TARGET,
worker_pid = %pid,
%syscall,
validation_code_hash = ?artifact.id.code_hash,
?artifact_path,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
if let Response::Ok{duration, ..} = response {
if duration > execution_timeout {
// The job didn't complete within the timeout.
+20 -106
View File
@@ -24,12 +24,12 @@ use crate::{
artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts},
execute::{self, PendingExecutionRequest},
metrics::Metrics,
prepare, Priority, ValidationError, LOG_TARGET,
prepare, security, Priority, ValidationError, LOG_TARGET,
};
use always_assert::never;
use futures::{
channel::{mpsc, oneshot},
Future, FutureExt, SinkExt, StreamExt,
join, Future, FutureExt, SinkExt, StreamExt,
};
use polkadot_node_core_pvf_common::{
error::{PrepareError, PrepareResult},
@@ -153,6 +153,7 @@ pub struct Config {
pub cache_path: PathBuf,
/// The version of the node. `None` can be passed to skip the version check (only for tests).
pub node_version: Option<String>,
/// The path to the program that can be used to spawn the prepare workers.
pub prepare_worker_program_path: PathBuf,
/// The time allotted for a prepare worker to spawn and report to the host.
@@ -162,6 +163,7 @@ pub struct Config {
pub prepare_workers_soft_max_num: usize,
/// The absolute number of workers that can be spawned in the prepare pool.
pub prepare_workers_hard_max_num: usize,
/// The path to the program that can be used to spawn the execute workers.
pub execute_worker_program_path: PathBuf,
/// The time allotted for an execute worker to spawn and report to the host.
@@ -181,10 +183,12 @@ impl Config {
Self {
cache_path,
node_version,
prepare_worker_program_path,
prepare_worker_spawn_timeout: Duration::from_secs(3),
prepare_workers_soft_max_num: 1,
prepare_workers_hard_max_num: 1,
execute_worker_program_path,
execute_worker_spawn_timeout: Duration::from_secs(3),
execute_workers_max_num: 2,
@@ -200,15 +204,24 @@ impl Config {
/// The future should not return normally but if it does then that indicates an unrecoverable error.
/// In that case all pending requests will be canceled, dropping the result senders and new ones
/// will be rejected.
pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future<Output = ()>) {
pub async fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future<Output = ()>) {
gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host");
// Run checks for supported security features once per host startup. Warn here if not enabled.
let security_status = {
let can_enable_landlock = check_landlock(&config.prepare_worker_program_path);
let can_unshare_user_namespace_and_change_root =
check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path);
SecurityStatus { can_enable_landlock, can_unshare_user_namespace_and_change_root }
// TODO: add check that syslog is available and that seccomp violations are logged?
let (can_enable_landlock, can_enable_seccomp, can_unshare_user_namespace_and_change_root) = join!(
security::check_landlock(&config.prepare_worker_program_path),
security::check_seccomp(&config.prepare_worker_program_path),
security::check_can_unshare_user_namespace_and_change_root(
&config.prepare_worker_program_path
)
);
SecurityStatus {
can_enable_landlock,
can_enable_seccomp,
can_unshare_user_namespace_and_change_root,
}
};
let (to_host_tx, to_host_rx) = mpsc::channel(10);
@@ -882,105 +895,6 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream<Item = ()>
.map(|_| ())
}
/// Check if we can sandbox the root and emit a warning if not.
///
/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible
/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
/// success and -1 on failure.
fn check_can_unshare_user_namespace_and_change_root(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
prepare_worker_program_path: &Path,
) -> bool {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
let output = std::process::Command::new(prepare_worker_program_path)
.arg("--check-can-unshare-user-namespace-and-change-root")
.output();
match output {
Ok(output) if output.status.success() => true,
Ok(output) => {
let stderr = std::str::from_utf8(&output.stderr)
.expect("child process writes a UTF-8 string to stderr; qed")
.trim();
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
// Docs say to always print status using `Display` implementation.
status = %output.status,
%stderr,
"Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security."
);
false
},
Err(err) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
"Could not start child process: {}",
err
);
false
},
}
} else {
gum::warn!(
target: LOG_TARGET,
"Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security."
);
false
}
}
}
/// Check if landlock is supported and emit a warning if not.
///
/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible
/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
/// success and -1 on failure.
fn check_landlock(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
prepare_worker_program_path: &Path,
) -> bool {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
match std::process::Command::new(prepare_worker_program_path)
.arg("--check-can-enable-landlock")
.status()
{
Ok(status) if status.success() => true,
Ok(status) => {
let abi =
polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8;
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
?status,
%abi,
"Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security."
);
false
},
Err(err) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
"Could not start child process: {}",
err
);
false
},
}
} else {
gum::warn!(
target: LOG_TARGET,
"Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security."
);
false
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use super::*;
+1
View File
@@ -97,6 +97,7 @@ mod host;
mod metrics;
mod prepare;
mod priority;
mod security;
mod worker_intf;
#[cfg(feature = "test-utils")]
@@ -18,6 +18,7 @@
use crate::{
metrics::Metrics,
security,
worker_intf::{
clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker,
SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
@@ -126,7 +127,9 @@ pub async fn start_work(
pid,
|tmp_artifact_file, mut stream, worker_dir| async move {
let preparation_timeout = pvf.prep_timeout();
if let Err(err) = send_request(&mut stream, pvf).await {
let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await;
if let Err(err) = send_request(&mut stream, pvf.clone()).await {
gum::warn!(
target: LOG_TARGET,
worker_pid = %pid,
@@ -150,7 +153,19 @@ pub async fn start_work(
match result {
// Received bytes from worker within the time limit.
Ok(Ok(prepare_result)) =>
Ok(Ok(prepare_result)) => {
// Check if any syscall violations occurred during the job. For now this is only
// informative, as we are not enforcing the seccomp policy yet.
for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
gum::error!(
target: LOG_TARGET,
worker_pid = %pid,
%syscall,
?pvf,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
handle_response(
metrics,
IdleWorker { stream, pid, worker_dir },
@@ -160,7 +175,8 @@ pub async fn start_work(
artifact_path,
preparation_timeout,
)
.await,
.await
},
Ok(Err(err)) => {
// Communication error within the time limit.
gum::warn!(
@@ -169,6 +185,21 @@ pub async fn start_work(
"failed to recv a prepare response: {:?}",
err,
);
// The worker died. Check if it was due to a seccomp violation.
//
// NOTE: Log, but don't change the outcome. Not all validators may have auditing
// enabled, so we don't want attackers to abuse a non-deterministic outcome.
for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await {
gum::error!(
target: LOG_TARGET,
worker_pid = %pid,
%syscall,
?pvf,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
Outcome::IoErr(err.to_string())
},
Err(_) => {
+312
View File
@@ -0,0 +1,312 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use crate::LOG_TARGET;
use std::path::Path;
use tokio::{
fs::{File, OpenOptions},
io::{AsyncReadExt, AsyncSeekExt, SeekFrom},
};
/// Check if we can sandbox the root and emit a warning if not.
///
/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible
/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
/// success and -1 on failure.
pub async fn check_can_unshare_user_namespace_and_change_root(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
prepare_worker_program_path: &Path,
) -> bool {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
match tokio::process::Command::new(prepare_worker_program_path)
.arg("--check-can-unshare-user-namespace-and-change-root")
.output()
.await
{
Ok(output) if output.status.success() => true,
Ok(output) => {
let stderr = std::str::from_utf8(&output.stderr)
.expect("child process writes a UTF-8 string to stderr; qed")
.trim();
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
// Docs say to always print status using `Display` implementation.
status = %output.status,
%stderr,
"Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security."
);
false
},
Err(err) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
"Could not start child process: {}",
err
);
false
},
}
} else {
gum::warn!(
target: LOG_TARGET,
"Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security."
);
false
}
}
}
/// Check if landlock is supported and emit a warning if not.
///
/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible
/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
/// success and -1 on failure.
pub async fn check_landlock(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
prepare_worker_program_path: &Path,
) -> bool {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
match tokio::process::Command::new(prepare_worker_program_path)
.arg("--check-can-enable-landlock")
.status()
.await
{
Ok(status) if status.success() => true,
Ok(status) => {
let abi =
polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8;
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
?status,
%abi,
"Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security."
);
false
},
Err(err) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
"Could not start child process: {}",
err
);
false
},
}
} else {
gum::warn!(
target: LOG_TARGET,
"Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security."
);
false
}
}
}
/// Check if seccomp is supported and emit a warning if not.
///
/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible
/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on
/// success and -1 on failure.
pub async fn check_seccomp(
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))]
prepare_worker_program_path: &Path,
) -> bool {
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
match tokio::process::Command::new(prepare_worker_program_path)
.arg("--check-can-enable-seccomp")
.status()
.await
{
Ok(status) if status.success() => true,
Ok(status) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
?status,
"Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security."
);
false
},
Err(err) => {
gum::warn!(
target: LOG_TARGET,
?prepare_worker_program_path,
"Could not start child process: {}",
err
);
false
},
}
} else {
gum::warn!(
target: LOG_TARGET,
"Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security."
);
false
}
}
}
const AUDIT_LOG_PATH: &'static str = "/var/log/audit/audit.log";
const SYSLOG_PATH: &'static str = "/var/log/syslog";
/// System audit log.
pub struct AuditLogFile {
file: File,
path: &'static str,
}
impl AuditLogFile {
/// Looks for an audit log file on the system and opens it, seeking to the end to skip any
/// events from before this was called.
///
/// A bit of a verbose name, but it should clue future refactorers not to move calls closer to
/// where the `AuditLogFile` is used.
pub async fn try_open_and_seek_to_end() -> Option<Self> {
let mut path = AUDIT_LOG_PATH;
let mut file = match OpenOptions::new().read(true).open(AUDIT_LOG_PATH).await {
Ok(file) => Ok(file),
Err(_) => {
path = SYSLOG_PATH;
OpenOptions::new().read(true).open(SYSLOG_PATH).await
},
}
.ok()?;
let _pos = file.seek(SeekFrom::End(0)).await;
Some(Self { file, path })
}
async fn read_new_since_open(mut self) -> String {
let mut buf = String::new();
let _len = self.file.read_to_string(&mut buf).await;
buf
}
}
/// Check if a seccomp violation occurred for the given worker. As the syslog may be in a different
/// location, or seccomp auditing may be disabled, this function provides a best-effort attempt
/// only.
///
/// The `audit_log_file` must have been obtained before the job started. It only allows reading
/// entries that were written since it was obtained, so that we do not consider events from previous
/// processes with the same pid. This can still be racy, but it's unlikely and fine for a
/// best-effort attempt.
pub async fn check_seccomp_violations_for_worker(
audit_log_file: Option<AuditLogFile>,
worker_pid: u32,
) -> Vec<u32> {
let audit_event_pid_field = format!("pid={worker_pid}");
let audit_log_file = match audit_log_file {
Some(file) => {
gum::debug!(
target: LOG_TARGET,
%worker_pid,
audit_log_path = ?file.path,
"checking audit log for seccomp violations",
);
file
},
None => {
gum::warn!(
target: LOG_TARGET,
%worker_pid,
"could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs"
);
return vec![]
},
};
let events = audit_log_file.read_new_since_open().await;
let mut violations = vec![];
for event in events.lines() {
if let Some(syscall) = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field) {
violations.push(syscall);
}
}
violations
}
fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option<u32> {
const SECCOMP_AUDIT_EVENT_TYPE: &'static str = "type=1326";
// Do a series of simple .contains instead of a regex, because I'm not sure if the fields are
// guaranteed to always be in the same order.
if !event.contains(SECCOMP_AUDIT_EVENT_TYPE) || !event.contains(&audit_event_pid_field) {
return None
}
// Get the syscall. Let's avoid a dependency on regex just for this.
for field in event.split(" ") {
if let Some(syscall) = field.strip_prefix("syscall=") {
return syscall.parse::<u32>().ok()
}
}
None
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_audit_log_for_seccomp_event() {
let audit_event_pid_field = "pid=2559058";
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559058 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
Some(53)
);
// pid is wrong
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
// type is wrong
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
// no syscall field
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
}
}
+4 -1
View File
@@ -245,7 +245,7 @@ pub enum SpawnErr {
/// has been terminated. Since the worker is running in another process it is obviously not
/// necessary to poll this future to make the worker run, it's only for termination detection.
///
/// This future relies on the fact that a child process's stdout `fd` is closed upon it's
/// This future relies on the fact that a child process's stdout `fd` is closed upon its
/// termination.
#[pin_project]
pub struct WorkerHandle {
@@ -270,6 +270,9 @@ impl WorkerHandle {
if security_status.can_enable_landlock {
args.push("--can-enable-landlock".to_string());
}
if security_status.can_enable_seccomp {
args.push("--can-enable-seccomp".to_string());
}
if security_status.can_unshare_user_namespace_and_change_root {
args.push("--can-unshare-user-namespace-and-change-root".to_string());
}