Tracking/limiting memory allocator (#1192)

This commit is contained in:
s0me0ne-unkn0wn
2023-11-03 16:48:41 +01:00
committed by GitHub
parent 8cfbee706d
commit cd2d5d2579
19 changed files with 569 additions and 58 deletions
@@ -11,7 +11,9 @@ cfg-if = "1.0"
gum = { package = "tracing-gum", path = "../../../gum" }
libc = "0.2.139"
rayon = "1.5.1"
tracking-allocator = { path = "../../../tracking-allocator" }
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tikv-jemallocator = { version = "0.5.0", optional = true }
parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }
@@ -22,11 +24,22 @@ sc-executor-common = { path = "../../../../../substrate/client/executor/common"
sc-executor-wasmtime = { path = "../../../../../substrate/client/executor/wasmtime" }
[target.'cfg(target_os = "linux")'.dependencies]
tikv-jemallocator = "0.5.0"
tikv-jemalloc-ctl = "0.5.0"
[features]
builder = []
jemalloc-allocator = [
"dep:tikv-jemalloc-ctl",
"dep:tikv-jemallocator",
"polkadot-node-core-pvf-common/jemalloc-allocator",
]
[dev-dependencies]
criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support"] }
rococo-runtime = { path = "../../../../runtime/rococo" }
sp-maybe-compressed-blob = { path = "../../../../../substrate/primitives/maybe-compressed-blob" }
[[bench]]
name = "prepare_rococo_runtime"
harness = false
@@ -0,0 +1,65 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
use polkadot_node_core_pvf_common::{
executor_intf::{prepare, prevalidate},
prepare::PrepareJobKind,
pvf::PvfPrepData,
};
use polkadot_primitives::ExecutorParams;
use std::time::Duration;
fn do_prepare_runtime(pvf: PvfPrepData) {
let blob = match prevalidate(&pvf.code()) {
Err(err) => panic!("{:?}", err),
Ok(b) => b,
};
match prepare(blob, &pvf.executor_params()) {
Ok(_) => (),
Err(err) => panic!("{:?}", err),
}
}
fn prepare_rococo_runtime(c: &mut Criterion) {
let blob = rococo_runtime::WASM_BINARY.unwrap();
let pvf = match sp_maybe_compressed_blob::decompress(&blob, 64 * 1024 * 1024) {
Ok(code) => PvfPrepData::from_code(
code.into_owned(),
ExecutorParams::default(),
Duration::from_secs(360),
PrepareJobKind::Compilation,
),
Err(e) => {
panic!("Cannot decompress blob: {:?}", e);
},
};
let mut group = c.benchmark_group("rococo");
group.sampling_mode(SamplingMode::Flat);
group.sample_size(20);
group.measurement_time(Duration::from_secs(240));
group.bench_function("prepare Rococo runtime", |b| {
// `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the
// benchmark accuracy
b.iter(|| do_prepare_runtime(pvf.clone()))
});
group.finish();
}
criterion_group!(preparation, prepare_rococo_runtime);
criterion_main!(preparation);
@@ -1,42 +0,0 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.
// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
//! Interface to the Substrate Executor
use polkadot_node_core_pvf_common::executor_intf::params_to_wasmtime_semantics;
use polkadot_primitives::ExecutorParams;
use sc_executor_common::runtime_blob::RuntimeBlob;
/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
pub fn prevalidate(code: &[u8]) -> Result<RuntimeBlob, sc_executor_common::error::WasmError> {
let blob = RuntimeBlob::new(code)?;
// It's assumed this function will take care of any prevalidation logic
// that needs to be done.
//
// Do nothing for now.
Ok(blob)
}
/// Runs preparation on the given runtime blob. If successful, it returns a serialized compiled
/// artifact which can then be used to pass into `Executor::execute` after writing it to the disk.
pub fn prepare(
blob: RuntimeBlob,
executor_params: &ExecutorParams,
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
let semantics = params_to_wasmtime_semantics(executor_params)
.map_err(|e| sc_executor_common::error::WasmError::Other(e))?;
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
}
@@ -16,10 +16,9 @@
//! Contains the logic for preparing PVFs. Used by the polkadot-prepare-worker binary.
mod executor_intf;
mod memory_stats;
pub use executor_intf::{prepare, prevalidate};
use polkadot_node_core_pvf_common::executor_intf::{prepare, prevalidate};
// NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are
// separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-prepare-worker=trace`.
@@ -31,7 +30,7 @@ use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread
use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
use parity_scale_codec::{Decode, Encode};
use polkadot_node_core_pvf_common::{
error::{PrepareError, PrepareResult},
error::{PrepareError, PrepareResult, OOM_PAYLOAD},
executor_intf::create_runtime_from_artifact_bytes,
framed_recv_blocking, framed_send_blocking,
prepare::{MemoryStats, PrepareJobKind, PrepareStats},
@@ -46,11 +45,24 @@ use polkadot_node_core_pvf_common::{
use polkadot_primitives::ExecutorParams;
use std::{
fs, io,
os::unix::net::UnixStream,
os::{
fd::{AsRawFd, RawFd},
unix::net::UnixStream,
},
path::PathBuf,
sync::{mpsc::channel, Arc},
time::Duration,
};
use tracking_allocator::TrackingAllocator;
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
#[global_allocator]
static ALLOC: TrackingAllocator<tikv_jemallocator::Jemalloc> =
TrackingAllocator(tikv_jemallocator::Jemalloc);
#[cfg(not(any(target_os = "linux", feature = "jemalloc-allocator")))]
#[global_allocator]
static ALLOC: TrackingAllocator<std::alloc::System> = TrackingAllocator(std::alloc::System);
/// Contains the bytes for a successfully compiled artifact.
pub struct CompiledArtifact(Vec<u8>);
@@ -83,6 +95,44 @@ fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<(
framed_send_blocking(stream, &result.encode())
}
fn start_memory_tracking(fd: RawFd, limit: Option<isize>) {
unsafe {
// SAFETY: Inside the failure handler, the allocator is locked and no allocations or
// deallocations are possible. For Linux, that always holds for the code below, so it's
// safe. For MacOS, that technically holds at the time of writing, but there are no future
// guarantees.
// The arguments of unsafe `libc` calls are valid, the payload validity is covered with
// a test.
ALLOC.start_tracking(
limit,
Some(Box::new(move || {
#[cfg(target_os = "linux")]
{
// Syscalls never allocate or deallocate, so this is safe.
libc::syscall(libc::SYS_write, fd, OOM_PAYLOAD.as_ptr(), OOM_PAYLOAD.len());
libc::syscall(libc::SYS_close, fd);
libc::syscall(libc::SYS_exit, 1);
}
#[cfg(not(target_os = "linux"))]
{
// Syscalls are not available on MacOS, so we have to use `libc` wrappers.
// Technicaly, there may be allocations inside, although they shouldn't be
// there. In that case, we'll see deadlocks on MacOS after the OOM condition
// triggered. As we consider running a validator on MacOS unsafe, and this
// code is only run by a validator, it's a lesser evil.
libc::write(fd, OOM_PAYLOAD.as_ptr().cast(), OOM_PAYLOAD.len());
libc::close(fd);
std::process::exit(1);
}
})),
);
}
}
fn end_memory_tracking() -> isize {
ALLOC.end_tracking()
}
/// The entrypoint that the spawned prepare worker should start with.
///
/// # Parameters
@@ -172,6 +222,22 @@ pub fn worker_entrypoint(
Arc::clone(&condvar),
WaitOutcome::TimedOut,
)?;
start_memory_tracking(
stream.as_raw_fd(),
executor_params.prechecking_max_memory().map(|v| {
v.try_into().unwrap_or_else(|_| {
gum::warn!(
LOG_TARGET,
%worker_pid,
"Illegal pre-checking max memory value {} discarded",
v,
);
0
})
}),
);
// Spawn another thread for preparation.
let prepare_thread = thread::spawn_worker_thread(
"prepare thread",
@@ -207,6 +273,17 @@ pub fn worker_entrypoint(
let outcome = thread::wait_for_threads(condvar);
let peak_alloc = {
let peak = end_memory_tracking();
gum::debug!(
target: LOG_TARGET,
%worker_pid,
"prepare job peak allocation is {} bytes",
peak,
);
peak
};
let result = match outcome {
WaitOutcome::Finished => {
let _ = cpu_time_monitor_tx.send(());
@@ -238,6 +315,14 @@ pub fn worker_entrypoint(
memory_tracker_stats,
#[cfg(target_os = "linux")]
max_rss: extract_max_rss_stat(max_rss, worker_pid),
// Negative peak allocation values are legit; they are narrow
// corner cases and shouldn't affect overall statistics
// significantly
peak_tracked_alloc: if peak_alloc > 0 {
peak_alloc as u64
} else {
0u64
},
};
// Write the serialized artifact into a temp file.