Tracking/limiting memory allocator (#1192)

This commit is contained in:
s0me0ne-unkn0wn
2023-11-03 16:48:41 +01:00
committed by GitHub
parent 8cfbee706d
commit cd2d5d2579
19 changed files with 569 additions and 58 deletions
+20
View File
@@ -93,6 +93,10 @@ impl Metrics {
metrics.preparation_max_resident.observe(max_resident_kb);
metrics.preparation_max_allocated.observe(max_allocated_kb);
}
metrics
.preparation_peak_tracked_allocation
.observe((memory_stats.peak_tracked_alloc / 1024) as f64);
}
}
}
@@ -110,10 +114,14 @@ struct MetricsInner {
execution_time: prometheus::Histogram,
#[cfg(target_os = "linux")]
preparation_max_rss: prometheus::Histogram,
// Max. allocated memory, tracked by Jemallocator, polling-based
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
preparation_max_allocated: prometheus::Histogram,
// Max. resident memory, tracked by Jemallocator, polling-based
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
preparation_max_resident: prometheus::Histogram,
// Peak allocation value, tracked by tracking-allocator
preparation_peak_tracked_allocation: prometheus::Histogram,
}
impl metrics::Metrics for Metrics {
@@ -271,6 +279,18 @@ impl metrics::Metrics for Metrics {
)?,
registry,
)?,
preparation_peak_tracked_allocation: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"polkadot_pvf_preparation_peak_tracked_allocation",
"peak allocation observed for preparation (in kilobytes)",
).buckets(
prometheus::exponential_buckets(8192.0, 2.0, 10)
.expect("arguments are always valid; qed"),
),
)?,
registry,
)?,
};
Ok(Metrics(Some(inner)))
}
@@ -399,6 +399,20 @@ fn handle_mux(
)?;
}
Ok(())
},
Outcome::OutOfMemory => {
if attempt_retire(metrics, spawned, worker) {
reply(
from_pool,
FromPool::Concluded {
worker,
rip: true,
result: Err(PrepareError::OutOfMemory),
},
)?;
}
Ok(())
},
}
@@ -98,6 +98,8 @@ pub enum Outcome {
///
/// This doesn't return an idle worker instance, thus this worker is no longer usable.
IoErr(String),
/// The worker ran out of memory and is aborting, so it is no longer usable and should be ripped.
OutOfMemory,
}
/// Given the idle token of a worker and parameters of work, communicates with the worker and
@@ -234,6 +236,7 @@ async fn handle_response(
Ok(result) => result,
// Timed out on the child. This should already be logged by the child.
Err(PrepareError::TimedOut) => return Outcome::TimedOut,
Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory,
Err(_) => return Outcome::Concluded { worker, result },
};
+1 -1
View File
@@ -36,8 +36,8 @@ pub fn validate_candidate(
code: &[u8],
params: &[u8],
) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
use polkadot_node_core_pvf_common::executor_intf::{prepare, prevalidate};
use polkadot_node_core_pvf_execute_worker::execute_artifact;
use polkadot_node_core_pvf_prepare_worker::{prepare, prevalidate};
let code = sp_maybe_compressed_blob::decompress(code, 10 * 1024 * 1024)
.expect("Decompressing code failed");