Switch to pooling copy-on-write instantiation strategy for WASM (#11232)

* Switch to pooling copy-on-write instantiation strategy for WASM * Fix benchmark compilation * Fix `cargo fmt` * Fix compilation of another benchmark I've missed * Cleanups according to review comments * Move `max_memory_size` to `Semantics` * Set `memory_guaranteed_dense_image_size` to `max_memory_size` * Rename `wasm_instantiation_strategy` to `wasmtime_instantiation_strategy` * Update the doc-comments regarding the instantiation strategy * Extend the integration tests to test every instantiation strategy * Don't drop the temporary directory until the runtime is dropped in benchmarks * Don't drop the temporary directory until the runtime is dropped in tests
2026-06-14 22:41:06 +00:00 · 2022-05-19 16:32:53 +09:00
parent b3b7b4ddc7
commit dd854c16e2
21 changed files with 726 additions and 236 deletions
@@ -17,26 +17,43 @@

 use criterion::{criterion_group, criterion_main, Criterion};

-use sc_executor_common::{runtime_blob::RuntimeBlob, wasm_runtime::WasmModule};
+use codec::Encode;
+
+use sc_executor_common::{
+	runtime_blob::RuntimeBlob,
+	wasm_runtime::{WasmInstance, WasmModule},
+};
+#[cfg(feature = "wasmtime")]
+use sc_executor_wasmtime::InstantiationStrategy;
 use sc_runtime_test::wasm_binary_unwrap as test_runtime;
 use sp_wasm_interface::HostFunctions as _;
-use std::sync::Arc;
+use std::sync::{
+	atomic::{AtomicBool, AtomicUsize, Ordering},
+	Arc,
+};

+#[derive(Clone)]
 enum Method {
 	Interpreted,
 	#[cfg(feature = "wasmtime")]
 	Compiled {
-		fast_instance_reuse: bool,
+		instantiation_strategy: InstantiationStrategy,
+		precompile: bool,
 	},
 }

-// This is just a bog-standard Kusama runtime with the extra `test_empty_return`
-// function copy-pasted from the test runtime.
+// This is just a bog-standard Kusama runtime with the extra
+// `test_empty_return` and `test_dirty_plenty_memory` functions
+// copy-pasted from the test runtime.
 fn kusama_runtime() -> &'static [u8] {
 	include_bytes!("kusama_runtime.wasm")
 }

-fn initialize(runtime: &[u8], method: Method) -> Arc<dyn WasmModule> {
+fn initialize(
+	_tmpdir: &mut Option<tempfile::TempDir>,
+	runtime: &[u8],
+	method: Method,
+) -> Arc<dyn WasmModule> {
 	let blob = RuntimeBlob::uncompress_if_needed(runtime).unwrap();
 	let host_functions = sp_io::SubstrateHostFunctions::host_functions();
 	let heap_pages = 2048;
@@ -51,80 +68,200 @@ fn initialize(runtime: &[u8], method: Method) -> Arc<dyn WasmModule> {
 		)
 		.map(|runtime| -> Arc<dyn WasmModule> { Arc::new(runtime) }),
 		#[cfg(feature = "wasmtime")]
-		Method::Compiled { fast_instance_reuse } =>
-			sc_executor_wasmtime::create_runtime::<sp_io::SubstrateHostFunctions>(
-				blob,
-				sc_executor_wasmtime::Config {
+		Method::Compiled { instantiation_strategy, precompile } => {
+			let config = sc_executor_wasmtime::Config {
+				allow_missing_func_imports,
+				cache_path: None,
+				semantics: sc_executor_wasmtime::Semantics {
+					extra_heap_pages: heap_pages,
+					instantiation_strategy,
+					deterministic_stack_limit: None,
+					canonicalize_nans: false,
+					parallel_compilation: true,
 					max_memory_size: None,
-					allow_missing_func_imports,
-					cache_path: None,
-					semantics: sc_executor_wasmtime::Semantics {
-						extra_heap_pages: heap_pages,
-						fast_instance_reuse,
-						deterministic_stack_limit: None,
-						canonicalize_nans: false,
-						parallel_compilation: true,
-					},
 				},
-			)
-			.map(|runtime| -> Arc<dyn WasmModule> { Arc::new(runtime) }),
+			};
+
+			if precompile {
+				let precompiled_blob =
+					sc_executor_wasmtime::prepare_runtime_artifact(blob, &config.semantics)
+						.unwrap();
+
+				// Create a fresh temporary directory to make absolutely sure
+				// we'll use the right module.
+				*_tmpdir = Some(tempfile::tempdir().unwrap());
+				let tmpdir = _tmpdir.as_ref().unwrap();
+
+				let path = tmpdir.path().join("module.bin");
+				std::fs::write(&path, &precompiled_blob).unwrap();
+				unsafe {
+					sc_executor_wasmtime::create_runtime_from_artifact::<
+						sp_io::SubstrateHostFunctions,
+					>(&path, config)
+				}
+			} else {
+				sc_executor_wasmtime::create_runtime::<sp_io::SubstrateHostFunctions>(blob, config)
+			}
+			.map(|runtime| -> Arc<dyn WasmModule> { Arc::new(runtime) })
+		},
 	}
 	.unwrap()
 }

+fn run_benchmark(
+	c: &mut Criterion,
+	benchmark_name: &str,
+	thread_count: usize,
+	runtime: &dyn WasmModule,
+	testcase: impl Fn(&mut Box<dyn WasmInstance>) + Copy + Send + 'static,
+) {
+	c.bench_function(benchmark_name, |b| {
+		// Here we deliberately start a bunch of extra threads which will just
+		// keep on independently instantiating the runtime over and over again.
+		//
+		// We don't really have to measure how much time those take since the
+		// work done is essentially the same on each thread, and what we're
+		// interested in here is only how those extra threads affect the execution
+		// on the current thread.
+		//
+		// In an ideal case, assuming we have enough CPU cores, those extra threads
+		// shouldn't affect the main thread's execution time at all; in practice,
+		// however, they're not completely independent. There might be per-process
+		// locks in the kernel which are briefly held during instantiation, etc.,
+		// and how much those affect the execution here is what we want to measure.
+		let is_benchmark_running = Arc::new(AtomicBool::new(true));
+		let threads_running = Arc::new(AtomicUsize::new(0));
+		let aux_threads: Vec<_> = (0..thread_count - 1)
+			.map(|_| {
+				let mut instance = runtime.new_instance().unwrap();
+				let is_benchmark_running = is_benchmark_running.clone();
+				let threads_running = threads_running.clone();
+				std::thread::spawn(move || {
+					threads_running.fetch_add(1, Ordering::SeqCst);
+					while is_benchmark_running.load(Ordering::Relaxed) {
+						testcase(&mut instance);
+					}
+				})
+			})
+			.collect();
+
+		while threads_running.load(Ordering::SeqCst) != (thread_count - 1) {
+			std::thread::yield_now();
+		}
+
+		let mut instance = runtime.new_instance().unwrap();
+		b.iter(|| testcase(&mut instance));
+
+		is_benchmark_running.store(false, Ordering::SeqCst);
+		for thread in aux_threads {
+			thread.join().unwrap();
+		}
+	});
+}
+
 fn bench_call_instance(c: &mut Criterion) {
 	let _ = env_logger::try_init();

-	#[cfg(feature = "wasmtime")]
-	{
-		let runtime = initialize(test_runtime(), Method::Compiled { fast_instance_reuse: true });
-		c.bench_function("call_instance_test_runtime_with_fast_instance_reuse", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap())
-		});
+	let strategies = [
+		#[cfg(feature = "wasmtime")]
+		(
+			"legacy_instance_reuse",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::LegacyInstanceReuse,
+				precompile: false,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"recreate_instance_vanilla",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::RecreateInstance,
+				precompile: false,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"recreate_instance_cow_fresh",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::RecreateInstanceCopyOnWrite,
+				precompile: false,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"recreate_instance_cow_precompiled",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::RecreateInstanceCopyOnWrite,
+				precompile: true,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"pooling_vanilla",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::Pooling,
+				precompile: false,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"pooling_cow_fresh",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::PoolingCopyOnWrite,
+				precompile: false,
+			},
+		),
+		#[cfg(feature = "wasmtime")]
+		(
+			"pooling_cow_precompiled",
+			Method::Compiled {
+				instantiation_strategy: InstantiationStrategy::PoolingCopyOnWrite,
+				precompile: true,
+			},
+		),
+		("interpreted", Method::Interpreted),
+	];
+
+	let runtimes = [("kusama_runtime", kusama_runtime()), ("test_runtime", test_runtime())];
+
+	let thread_counts = [1, 2, 4, 8, 16];
+
+	fn test_call_empty_function(instance: &mut Box<dyn WasmInstance>) {
+		instance.call_export("test_empty_return", &[0]).unwrap();
 	}

-	#[cfg(feature = "wasmtime")]
-	{
-		let runtime = initialize(test_runtime(), Method::Compiled { fast_instance_reuse: false });
-		c.bench_function("call_instance_test_runtime_without_fast_instance_reuse", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap());
-		});
+	fn test_dirty_1mb_of_memory(instance: &mut Box<dyn WasmInstance>) {
+		instance.call_export("test_dirty_plenty_memory", &(0, 16).encode()).unwrap();
 	}

-	#[cfg(feature = "wasmtime")]
-	{
-		let runtime = initialize(kusama_runtime(), Method::Compiled { fast_instance_reuse: true });
-		c.bench_function("call_instance_kusama_runtime_with_fast_instance_reuse", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap())
-		});
-	}
+	let testcases = [
+		("call_empty_function", test_call_empty_function as fn(&mut Box<dyn WasmInstance>)),
+		("dirty_1mb_of_memory", test_dirty_1mb_of_memory),
+	];

-	#[cfg(feature = "wasmtime")]
-	{
-		let runtime = initialize(kusama_runtime(), Method::Compiled { fast_instance_reuse: false });
-		c.bench_function("call_instance_kusama_runtime_without_fast_instance_reuse", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap());
-		});
-	}
+	let num_cpus = num_cpus::get_physical();
+	let mut tmpdir = None;

-	{
-		let runtime = initialize(test_runtime(), Method::Interpreted);
-		c.bench_function("call_instance_test_runtime_interpreted", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap())
-		});
-	}
+	for (strategy_name, strategy) in strategies {
+		for (runtime_name, runtime) in runtimes {
+			let runtime = initialize(&mut tmpdir, runtime, strategy.clone());

-	{
-		let runtime = initialize(kusama_runtime(), Method::Interpreted);
-		c.bench_function("call_instance_kusama_runtime_interpreted", |b| {
-			let mut instance = runtime.new_instance().unwrap();
-			b.iter(|| instance.call_export("test_empty_return", &[0]).unwrap())
-		});
+			for (testcase_name, testcase) in testcases {
+				for thread_count in thread_counts {
+					if thread_count > num_cpus {
+						// If there are not enough cores available the benchmark is pointless.
+						continue
+					}
+
+					let benchmark_name = format!(
+						"{}_from_{}_with_{}_on_{}_threads",
+						testcase_name, runtime_name, strategy_name, thread_count
+					);
+
+					run_benchmark(c, &benchmark_name, thread_count, &*runtime, testcase);
+				}
+			}
+		}
 	}
 }