Preserve artifact cache unless stale (#1918)

Co-authored-by: Marcin S <marcin@realemail.net>
This commit is contained in:
Julian Eager
2023-11-20 02:04:22 +08:00
committed by GitHub
parent 794ee98049
commit b5858936e1
22 changed files with 536 additions and 245 deletions
+290 -95
View File
@@ -16,10 +16,10 @@
//! PVF artifacts (final compiled code blobs).
//!
//! # Lifecycle of an artifact
//! # Lifecycle of an artifact
//!
//! 1. During node start-up, the artifacts cache is cleaned up. This means that all local artifacts
//! stored on-disk are cleared, and we start with an empty [`Artifacts`] table.
//! 1. During node start-up, we will check the cached artifacts, if any. The stale and corrupted
//! ones are pruned. The valid ones are registered in the [`Artifacts`] table.
//!
//! 2. In order to be executed, a PVF should be prepared first. This means that artifacts should
//! have an [`ArtifactState::Prepared`] entry for that artifact in the table. If not, the
@@ -55,18 +55,29 @@
//! older by a predefined parameter. This process is run very rarely (say, once a day). Once the
//! artifact is expired, it is eagerly and atomically removed from disk.
use crate::host::PrepareResultSender;
use crate::{host::PrecheckResultSender, LOG_TARGET};
use always_assert::always;
use polkadot_node_core_pvf_common::{error::PrepareError, prepare::PrepareStats, pvf::PvfPrepData};
use polkadot_core_primitives::Hash;
use polkadot_node_core_pvf_common::{
error::PrepareError, prepare::PrepareStats, pvf::PvfPrepData, RUNTIME_VERSION,
};
use polkadot_node_primitives::NODE_VERSION;
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
use polkadot_primitives::ExecutorParamsHash;
use std::{
collections::HashMap,
path::{Path, PathBuf},
str::FromStr as _,
time::{Duration, SystemTime},
};
const RUNTIME_PREFIX: &str = "wasmtime_v";
const NODE_PREFIX: &str = "polkadot_v";
/// Builds the file-name prefix shared by every artifact produced by this node build.
///
/// The prefix has the shape `wasmtime_v<RUNTIME_VERSION>_polkadot_v<NODE_VERSION>`, so a change
/// in either version yields a different prefix.
fn artifact_prefix() -> String {
    format!("{RUNTIME_PREFIX}{RUNTIME_VERSION}_{NODE_PREFIX}{NODE_VERSION}")
}
/// Identifier of an artifact. Encodes a code hash of the PVF and a hash of executor parameter set.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ArtifactId {
@@ -75,9 +86,6 @@ pub struct ArtifactId {
}
impl ArtifactId {
const PREFIX: &'static str = "wasmtime_";
const NODE_VERSION_PREFIX: &'static str = "polkadot_v";
/// Creates a new artifact ID with the given hash.
pub fn new(code_hash: ValidationCodeHash, executor_params_hash: ExecutorParamsHash) -> Self {
Self { code_hash, executor_params_hash }
@@ -88,38 +96,34 @@ impl ArtifactId {
Self::new(pvf.code_hash(), pvf.executor_params().hash())
}
/// Tries to recover the artifact id from the given file name.
#[cfg(test)]
pub fn from_file_name(file_name: &str) -> Option<Self> {
use polkadot_core_primitives::Hash;
use std::str::FromStr as _;
let file_name =
file_name.strip_prefix(Self::PREFIX)?.strip_prefix(Self::NODE_VERSION_PREFIX)?;
// [ node version | code hash | param hash ]
let parts: Vec<&str> = file_name.split('_').collect();
let (_node_ver, code_hash_str, executor_params_hash_str) = (parts[0], parts[1], parts[2]);
let code_hash = Hash::from_str(code_hash_str).ok()?.into();
let executor_params_hash =
ExecutorParamsHash::from_hash(Hash::from_str(executor_params_hash_str).ok()?);
Some(Self { code_hash, executor_params_hash })
}
/// Returns the expected path to this artifact given the root of the cache.
pub fn path(&self, cache_path: &Path) -> PathBuf {
/// Returns the canonical path to the concluded artifact.
pub(crate) fn path(&self, cache_path: &Path, checksum: &str) -> PathBuf {
let file_name = format!(
"{}{}{}_{:#x}_{:#x}",
Self::PREFIX,
Self::NODE_VERSION_PREFIX,
NODE_VERSION,
"{}_{:#x}_{:#x}_0x{}",
artifact_prefix(),
self.code_hash,
self.executor_params_hash
self.executor_params_hash,
checksum
);
cache_path.join(file_name)
}
/// Parses an artifact file name back into its [`ArtifactId`].
///
/// A valid name has the shape `<PREFIX>_<CODE_HASH>_<PARAM_HASH>_<CHECKSUM>`, where `<PREFIX>`
/// is the value returned by `artifact_prefix()`. Returns `None` for any other name, including
/// names whose hashes fail to parse. The checksum component must be present, but its value is
/// not inspected here.
fn from_file_name(file_name: &str) -> Option<Self> {
    let rest = file_name.strip_prefix(artifact_prefix().as_str())?.strip_prefix('_')?;

    // Exactly three `_`-separated components are expected after the prefix.
    let mut components = rest.split('_');
    let (code_hash, param_hash) =
        match (components.next(), components.next(), components.next(), components.next()) {
            (Some(code_hash), Some(param_hash), Some(_checksum), None) => (code_hash, param_hash),
            _ => return None,
        };

    let code_hash = Hash::from_str(code_hash).ok()?.into();
    let executor_params_hash = ExecutorParamsHash::from_hash(Hash::from_str(param_hash).ok()?);
    Some(Self { code_hash, executor_params_hash })
}
}
/// A bundle of the artifact ID and the path.
@@ -136,8 +140,8 @@ pub struct ArtifactPathId {
}
impl ArtifactPathId {
pub(crate) fn new(artifact_id: ArtifactId, cache_path: &Path) -> Self {
Self { path: artifact_id.path(cache_path), id: artifact_id }
pub(crate) fn new(artifact_id: ArtifactId, path: &Path) -> Self {
Self { id: artifact_id, path: path.to_owned() }
}
}
@@ -148,6 +152,8 @@ pub enum ArtifactState {
/// That means that the artifact should be accessible through the path obtained by the artifact
/// id (unless, it was removed externally).
Prepared {
/// The path of the compiled artifact.
path: PathBuf,
/// The time when the artifact was last needed.
///
/// This is updated when we get the heads up for this artifact or when we just discover
@@ -159,7 +165,7 @@ pub enum ArtifactState {
/// A task to prepare this artifact is scheduled.
Preparing {
/// List of result senders that are waiting for a response.
waiting_for_response: Vec<PrepareResultSender>,
waiting_for_response: Vec<PrecheckResultSender>,
/// The number of times this artifact has failed to prepare.
num_failures: u32,
},
@@ -177,32 +183,148 @@ pub enum ArtifactState {
/// A container of all known artifact ids and their states.
pub struct Artifacts {
artifacts: HashMap<ArtifactId, ArtifactState>,
inner: HashMap<ArtifactId, ArtifactState>,
}
impl Artifacts {
/// Initialize a blank cache at the given path. This will clear everything present at the
/// given path, to be populated over time.
///
/// The recognized artifacts will be filled in the table and unrecognized will be removed.
pub async fn new(cache_path: &Path) -> Self {
// First delete the entire cache. This includes artifacts and any leftover worker dirs (see
// [`WorkerDir`]). Nodes are long-running so this should populate shortly.
let _ = tokio::fs::remove_dir_all(cache_path).await;
// Make sure that the cache path directory and all its parents are created.
let _ = tokio::fs::create_dir_all(cache_path).await;
Self { artifacts: HashMap::new() }
#[cfg(test)]
pub(crate) fn empty() -> Self {
Self { inner: HashMap::new() }
}
#[cfg(test)]
pub(crate) fn empty() -> Self {
Self { artifacts: HashMap::new() }
pub(crate) fn len(&self) -> usize {
self.inner.len()
}
/// Builds the artifacts table from whatever is already stored in `cache_path`.
///
/// Starts from an empty table and scans the cache directory. Files recognized as valid
/// artifacts by their file name (e.g., matching the current node version) are registered as
/// [`ArtifactState::Prepared`]; the ones deemed invalid are pruned from disk.
pub async fn new_and_prune(cache_path: &Path) -> Self {
    let mut table = Self { inner: HashMap::new() };
    table.insert_and_prune(cache_path).await;
    table
}
/// Scans `cache_path` and fills the table from the artifact files found directly inside it.
///
/// The directory (and its parents) is created first if it is missing. Every regular file is
/// then examined:
/// - a file whose name does not parse as an artifact name, or whose name does not end with the
///   blake3 checksum of its contents, is deleted;
/// - a further valid file for an artifact ID that is already registered is deleted as well,
///   because the table holds a single path per ID and `insert_prepared` requires a fresh ID;
/// - every other file is registered as [`ArtifactState::Prepared`] with the current time as
///   `last_time_needed` and default prepare stats.
///
/// Entries that are not regular files and file names that are not valid Unicode are left
/// untouched. A failure to open the directory or to advance its iterator is logged and ends
/// the scan early.
async fn insert_and_prune(&mut self, cache_path: &Path) {
    /// Returns `true` if the file cannot be read, or if its name does not end with the
    /// hex-encoded blake3 hash of its contents.
    async fn is_corrupted(path: &Path) -> bool {
        let checksum = match tokio::fs::read(path).await {
            Ok(bytes) => blake3::hash(&bytes),
            Err(err) => {
                // just remove the file if we cannot read it
                gum::warn!(
                    target: LOG_TARGET,
                    ?err,
                    "unable to read artifact {:?} when checking integrity, removing...",
                    path,
                );
                return true
            },
        };

        // A missing or non-Unicode name cannot carry a checksum suffix, so it counts as
        // corrupted.
        path.file_name()
            .and_then(|name| name.to_str())
            .map_or(true, |name| !name.ends_with(checksum.to_hex().as_str()))
    }

    /// Best-effort removal of a rejected cache file; a failure is logged, never fatal.
    async fn discard(path: &Path) {
        if let Err(err) = tokio::fs::remove_file(path).await {
            gum::warn!(
                target: LOG_TARGET,
                ?err,
                "failed to remove artifact {:?}",
                path,
            );
        }
    }

    // Insert the entry into the artifacts table if it is valid.
    // Otherwise, prune it.
    async fn insert_or_prune(
        artifacts: &mut Artifacts,
        entry: &tokio::fs::DirEntry,
        cache_path: &Path,
    ) {
        let file_type = entry.file_type().await;
        let os_file_name = entry.file_name();

        match file_type {
            Ok(file_type) if file_type.is_file() => {},
            // Anything that is not a regular file is not an artifact; leave it alone.
            Ok(_) => return,
            Err(err) => {
                gum::warn!(
                    target: LOG_TARGET,
                    ?err,
                    "unable to get file type for {:?}",
                    os_file_name,
                );
                return
            },
        }

        let file_name = match os_file_name.to_str() {
            Some(file_name) => file_name,
            None => {
                gum::warn!(
                    target: LOG_TARGET,
                    "non-Unicode file name {:?} found in {:?}",
                    os_file_name,
                    cache_path,
                );
                return
            },
        };

        let path = cache_path.join(file_name);

        // The contents are only hashed once the name itself has been accepted.
        let mut id = ArtifactId::from_file_name(file_name);
        if id.is_some() && is_corrupted(&path).await {
            id = None;
        }

        let id = match id {
            Some(id) => id,
            None => {
                gum::warn!(
                    target: LOG_TARGET,
                    "discarding invalid artifact {:?}",
                    &path,
                );
                discard(&path).await;
                return
            },
        };

        // Two files can map to the same ID when they differ only in the checksum part of the
        // name. Only one path can be tracked per ID, so keep the one registered first and prune
        // the other instead of overwriting the entry and orphaning a file on disk.
        if artifacts.inner.contains_key(&id) {
            gum::warn!(
                target: LOG_TARGET,
                "discarding duplicate artifact {:?}",
                &path,
            );
            discard(&path).await;
            return
        }

        gum::debug!(
            target: LOG_TARGET,
            "reusing existing {:?} for node version v{}",
            &path,
            NODE_VERSION,
        );
        artifacts.insert_prepared(id, path, SystemTime::now(), Default::default());
    }

    // Make sure that the cache path directory and all its parents are created. A failure here
    // surfaces through the `read_dir` error right below.
    let _ = tokio::fs::create_dir_all(cache_path).await;

    let mut dir = match tokio::fs::read_dir(cache_path).await {
        Ok(dir) => dir,
        Err(err) => {
            gum::error!(
                target: LOG_TARGET,
                ?err,
                "failed to read dir {:?}",
                cache_path,
            );
            return
        },
    };

    loop {
        match dir.next_entry().await {
            Ok(Some(entry)) => insert_or_prune(self, &entry, cache_path).await,
            Ok(None) => break,
            Err(err) => {
                gum::warn!(
                    target: LOG_TARGET,
                    ?err,
                    "error processing artifacts in {:?}",
                    cache_path,
                );
                break
            },
        }
    }
}
/// Returns the state of the given artifact by its ID.
pub fn artifact_state_mut(&mut self, artifact_id: &ArtifactId) -> Option<&mut ArtifactState> {
self.artifacts.get_mut(artifact_id)
self.inner.get_mut(artifact_id)
}
/// Inform the table about the artifact with the given ID. The state will be set to "preparing".
@@ -212,53 +334,52 @@ impl Artifacts {
pub fn insert_preparing(
&mut self,
artifact_id: ArtifactId,
waiting_for_response: Vec<PrepareResultSender>,
waiting_for_response: Vec<PrecheckResultSender>,
) {
// See the precondition.
always!(self
.artifacts
.inner
.insert(artifact_id, ArtifactState::Preparing { waiting_for_response, num_failures: 0 })
.is_none());
}
/// Insert an artifact with the given ID as "prepared".
///
/// This function must be used only for brand-new artifacts and should never be used for
/// replacing existing ones.
#[cfg(test)]
pub fn insert_prepared(
/// This function should only be used to build the artifact table at startup with valid
/// artifact caches.
pub(crate) fn insert_prepared(
&mut self,
artifact_id: ArtifactId,
path: PathBuf,
last_time_needed: SystemTime,
prepare_stats: PrepareStats,
) {
// See the precondition.
always!(self
.artifacts
.insert(artifact_id, ArtifactState::Prepared { last_time_needed, prepare_stats })
.inner
.insert(artifact_id, ArtifactState::Prepared { path, last_time_needed, prepare_stats })
.is_none());
}
/// Remove and retrieve the artifacts from the table that are older than the supplied
/// Time-To-Live.
pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<ArtifactId> {
/// Remove artifacts older than the given TTL and return id and path of the removed ones.
pub fn prune(&mut self, artifact_ttl: Duration) -> Vec<(ArtifactId, PathBuf)> {
let now = SystemTime::now();
let mut to_remove = vec![];
for (k, v) in self.artifacts.iter() {
if let ArtifactState::Prepared { last_time_needed, .. } = *v {
for (k, v) in self.inner.iter() {
if let ArtifactState::Prepared { last_time_needed, ref path, .. } = *v {
if now
.duration_since(last_time_needed)
.map(|age| age > artifact_ttl)
.unwrap_or(false)
{
to_remove.push(k.clone());
to_remove.push((k.clone(), path.clone()));
}
}
}
for artifact in &to_remove {
self.artifacts.remove(artifact);
self.inner.remove(&artifact.0);
}
to_remove
@@ -267,13 +388,72 @@ impl Artifacts {
#[cfg(test)]
mod tests {
use super::{ArtifactId, Artifacts, NODE_VERSION};
use super::{artifact_prefix as prefix, ArtifactId, Artifacts, NODE_VERSION, RUNTIME_VERSION};
use polkadot_primitives::ExecutorParamsHash;
use rand::Rng;
use sp_core::H256;
use std::{path::Path, str::FromStr};
use std::{
fs,
io::Write,
path::{Path, PathBuf},
str::FromStr,
};
fn file_name(code_hash: &str, param_hash: &str) -> String {
format!("wasmtime_polkadot_v{}_0x{}_0x{}", NODE_VERSION, code_hash, param_hash)
/// Produces a random string of `len` lowercase hexadecimal digits.
fn rand_hash(len: usize) -> String {
    const HEX_DIGITS: &[u8] = b"0123456789abcdef";

    let mut rng = rand::thread_rng();
    (0..len)
        .map(|_| char::from(HEX_DIGITS[rng.gen_range(0..HEX_DIGITS.len())]))
        .collect()
}
/// Formats the file name of an artifact under the current prefix.
///
/// All three arguments are hex strings given without the `0x` marker; it is added here.
fn file_name(code_hash: &str, param_hash: &str, checksum: &str) -> String {
    let head = prefix();
    format!("{head}_0x{code_hash}_0x{param_hash}_0x{checksum}")
}
fn create_artifact(
dir: impl AsRef<Path>,
prefix: &str,
code_hash: impl AsRef<str>,
params_hash: impl AsRef<str>,
) -> (PathBuf, String) {
fn artifact_path_without_checksum(
dir: impl AsRef<Path>,
prefix: &str,
code_hash: impl AsRef<str>,
params_hash: impl AsRef<str>,
) -> PathBuf {
let mut path = dir.as_ref().to_path_buf();
let file_name =
format!("{}_0x{}_0x{}", prefix, code_hash.as_ref(), params_hash.as_ref(),);
path.push(file_name);
path
}
let (code_hash, params_hash) = (code_hash.as_ref(), params_hash.as_ref());
let path = artifact_path_without_checksum(dir, prefix, code_hash, params_hash);
let mut file = fs::File::create(&path).unwrap();
let content = format!("{}{}", code_hash, params_hash).into_bytes();
file.write_all(&content).unwrap();
let checksum = blake3::hash(&content).to_hex().to_string();
(path, checksum)
}
/// Creates an artifact file in `dir` via `create_artifact`, using random 64-digit hex strings
/// for both the code hash and the params hash.
fn create_rand_artifact(dir: impl AsRef<Path>, prefix: &str) -> (PathBuf, String) {
    let code_hash = rand_hash(64);
    let params_hash = rand_hash(64);
    create_artifact(dir, prefix, code_hash, params_hash)
}
/// Returns `path` with `_0x<checksum>` appended to its final component.
///
/// Panics if `path` has no file name.
fn concluded_path(path: impl AsRef<Path>, checksum: &str) -> PathBuf {
    let original = path.as_ref();
    let suffix = format!("_0x{checksum}");

    let mut concluded_name = original.file_name().unwrap().to_os_string();
    concluded_name.push(suffix);
    original.with_file_name(concluded_name)
}
/// The prefix must embed both the runtime (wasmtime) version and the node version.
#[test]
fn artifact_prefix() {
    let expected = format!("wasmtime_v{}_polkadot_v{}", RUNTIME_VERSION, NODE_VERSION);
    assert_eq!(prefix(), expected);
}
#[test]
@@ -284,6 +464,7 @@ mod tests {
let file_name = file_name(
"0022800000000000000000000000000000000000000000000000000000000000",
"0033900000000000000000000000000000000000000000000000000000000000",
"00000000000000000000000000000000",
);
assert_eq!(
@@ -305,40 +486,54 @@ mod tests {
let dir = Path::new("/test");
let code_hash = "1234567890123456789012345678901234567890123456789012345678901234";
let params_hash = "4321098765432109876543210987654321098765432109876543210987654321";
let file_name = file_name(code_hash, params_hash);
let checksum = "34567890123456789012345678901234";
let file_name = file_name(code_hash, params_hash, checksum);
let code_hash = H256::from_str(code_hash).unwrap();
let params_hash = H256::from_str(params_hash).unwrap();
let path = ArtifactId::new(code_hash.into(), ExecutorParamsHash::from_hash(params_hash))
.path(dir, checksum);
assert_eq!(
ArtifactId::new(code_hash.into(), ExecutorParamsHash::from_hash(params_hash))
.path(dir)
.to_str(),
Some(format!("/test/{}", file_name).as_str()),
);
assert_eq!(path.to_str().unwrap(), format!("/test/{}", file_name));
}
#[tokio::test]
async fn artifacts_removes_cache_on_startup() {
let fake_cache_path = crate::worker_intf::tmppath("test-cache").await.unwrap();
let fake_artifact_path = {
let mut p = fake_cache_path.clone();
p.push("wasmtime_0x1234567890123456789012345678901234567890123456789012345678901234");
p
};
async fn remove_stale_cache_on_startup() {
let cache_dir = crate::worker_intf::tmppath("test-cache").await.unwrap();
fs::create_dir_all(&cache_dir).unwrap();
// create a tmp cache with 1 artifact.
// invalid prefix
create_rand_artifact(&cache_dir, "");
create_rand_artifact(&cache_dir, "wasmtime_polkadot_v");
create_rand_artifact(&cache_dir, "wasmtime_v8.0.0_polkadot_v1.0.0");
std::fs::create_dir_all(&fake_cache_path).unwrap();
std::fs::File::create(fake_artifact_path).unwrap();
let prefix = prefix();
// this should remove it and re-create.
// no checksum
create_rand_artifact(&cache_dir, &prefix);
let p = &fake_cache_path;
Artifacts::new(p).await;
// invalid hashes
let (path, checksum) = create_artifact(&cache_dir, &prefix, "000", "000001");
let new_path = concluded_path(&path, &checksum);
fs::rename(&path, &new_path).unwrap();
assert_eq!(std::fs::read_dir(&fake_cache_path).unwrap().count(), 0);
// checksum tampered
let (path, checksum) = create_rand_artifact(&cache_dir, &prefix);
let new_path = concluded_path(&path, checksum.chars().rev().collect::<String>().as_str());
fs::rename(&path, &new_path).unwrap();
std::fs::remove_dir_all(fake_cache_path).unwrap();
// valid
let (path, checksum) = create_rand_artifact(&cache_dir, &prefix);
let new_path = concluded_path(&path, &checksum);
fs::rename(&path, &new_path).unwrap();
assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 7);
let artifacts = Artifacts::new_and_prune(&cache_dir).await;
assert_eq!(fs::read_dir(&cache_dir).unwrap().count(), 1);
assert_eq!(artifacts.len(), 1);
fs::remove_dir_all(cache_dir).unwrap();
}
}