add --num-cpus option

This commit is contained in:
James Wilson
2021-08-05 11:03:53 +01:00
parent 6221cbfd17
commit 759d28b1a2
8 changed files with 94 additions and 32 deletions
+2
View File
@@ -1271,6 +1271,7 @@ dependencies = [
"http", "http",
"hyper", "hyper",
"log", "log",
"num_cpus",
"once_cell", "once_cell",
"parking_lot", "parking_lot",
"primitive-types", "primitive-types",
@@ -1301,6 +1302,7 @@ dependencies = [
"http", "http",
"hyper", "hyper",
"log", "log",
"num_cpus",
"primitive-types", "primitive-types",
"serde", "serde",
"serde_json", "serde_json",
+1
View File
@@ -16,6 +16,7 @@ hex = "0.4.3"
http = "0.2.4" http = "0.2.4"
hyper = "0.14.11" hyper = "0.14.11"
log = "0.4.14" log = "0.4.14"
num_cpus = "1.13.0"
once_cell = "1.8.0" once_cell = "1.8.0"
parking_lot = "0.11.1" parking_lot = "0.11.1"
primitive-types = { version = "0.9.0", features = ["serde"] } primitive-types = { version = "0.9.0", features = ["serde"] }
@@ -113,7 +113,7 @@ impl Aggregator {
/// Return a sink that a feed can send messages into to be handled by the aggregator. /// Return a sink that a feed can send messages into to be handled by the aggregator.
pub fn subscribe_feed( pub fn subscribe_feed(
&self, &self,
) -> impl Sink<inner_loop::FromFeedWebsocket, Error = anyhow::Error> + Send + Sync + Unpin + 'static ) -> (u64, impl Sink<inner_loop::FromFeedWebsocket, Error = anyhow::Error> + Send + Sync + Unpin + 'static)
{ {
// Assign a unique aggregator-local ID to each connection that subscribes, and pass // Assign a unique aggregator-local ID to each connection that subscribes, and pass
// that along with every message to the aggregator loop: // that along with every message to the aggregator loop:
@@ -125,11 +125,11 @@ impl Aggregator {
// Calling `send` on this Sink requires Unpin. There may be a nicer way than this, // Calling `send` on this Sink requires Unpin. There may be a nicer way than this,
// but pinning by boxing is the easy solution for now: // but pinning by boxing is the easy solution for now:
Box::pin(tx_to_aggregator.with(move |msg| async move { (feed_conn_id, Box::pin(tx_to_aggregator.with(move |msg| async move {
Ok(inner_loop::ToAggregator::FromFeedWebsocket( Ok(inner_loop::ToAggregator::FromFeedWebsocket(
feed_conn_id.into(), feed_conn_id.into(),
msg, msg,
)) ))
})) })))
} }
} }
+48 -19
View File
@@ -19,7 +19,6 @@ mod feed_message;
mod find_location; mod find_location;
mod state; mod state;
use std::str::FromStr; use std::str::FromStr;
use std::sync::atomic::AtomicUsize;
use tokio::time::{Duration, Instant}; use tokio::time::{Duration, Instant};
use aggregator::{ use aggregator::{
@@ -60,10 +59,13 @@ struct Opts {
/// to a feed, the feed connection will be closed. /// to a feed, the feed connection will be closed.
#[structopt(long, default_value = "10")] #[structopt(long, default_value = "10")]
feed_timeout: u64, feed_timeout: u64,
/// Number of worker threads to spawn. Defaults to the number of CPUs on the machine.
/// If "0" is given, use the number of CPUs available on the machine.
#[structopt(long)]
num_cpus: Option<usize>,
} }
#[tokio::main] fn main() {
async fn main() {
let opts = Opts::from_args(); let opts = Opts::from_args();
SimpleLogger::new() SimpleLogger::new()
@@ -73,9 +75,20 @@ async fn main() {
log::info!("Starting Telemetry Core version: {}", VERSION); log::info!("Starting Telemetry Core version: {}", VERSION);
if let Err(e) = start_server(opts).await { let num_cpus_to_use = opts.num_cpus
log::error!("Error starting server: {}", e); .and_then(|n| if n == 0 { None } else { Some(n) })
} .unwrap_or_else(|| num_cpus::get());
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.worker_threads(num_cpus_to_use)
.build()
.unwrap()
.block_on(async {
if let Err(e) = start_server(opts).await {
log::error!("Error starting server: {}", e);
}
});
} }
/// Declare our routes and start the server. /// Declare our routes and start the server.
@@ -95,13 +108,14 @@ async fn start_server(opts: Opts) -> anyhow::Result<()> {
Ok(http_utils::upgrade_to_websocket( Ok(http_utils::upgrade_to_websocket(
req, req,
move |ws_send, ws_recv| async move { move |ws_send, ws_recv| async move {
let tx_to_aggregator = aggregator.subscribe_feed(); let (feed_id, tx_to_aggregator) = aggregator.subscribe_feed();
let (mut tx_to_aggregator, mut ws_send) = let (mut tx_to_aggregator, mut ws_send) =
handle_feed_websocket_connection( handle_feed_websocket_connection(
ws_send, ws_send,
ws_recv, ws_recv,
tx_to_aggregator, tx_to_aggregator,
feed_timeout, feed_timeout,
feed_id,
) )
.await; .await;
log::info!("Closing /feed connection from {:?}", addr); log::info!("Closing /feed connection from {:?}", addr);
@@ -291,6 +305,7 @@ async fn handle_feed_websocket_connection<S>(
mut ws_recv: http_utils::WsReceiver, mut ws_recv: http_utils::WsReceiver,
mut tx_to_aggregator: S, mut tx_to_aggregator: S,
feed_timeout: u64, feed_timeout: u64,
feed_id: u64
) -> (S, http_utils::WsSender) ) -> (S, http_utils::WsSender)
where where
S: futures::Sink<FromFeedWebsocket, Error = anyhow::Error> + Unpin + Send + 'static, S: futures::Sink<FromFeedWebsocket, Error = anyhow::Error> + Unpin + Send + 'static,
@@ -364,34 +379,48 @@ where
drop(send_closer_tx); // Kill the send task if this recv task ends drop(send_closer_tx); // Kill the send task if this recv task ends
tx_to_aggregator tx_to_aggregator
}); });
let mut i: u64 = 0;
// Send messages to the feed: // Send messages to the feed:
let send_handle = tokio::spawn(async move { let send_handle = tokio::spawn(async move {
'outer: loop { 'outer: loop {
let debounce = tokio::time::sleep_until(Instant::now() + Duration::from_millis(75));
let msgs = tokio::select! { let msgs = tokio::select! {
msgs = rx_from_aggregator_chunks.next() => msgs, msgs = rx_from_aggregator_chunks.next() => msgs,
_ = &mut send_closer_rx => { break } _ = &mut send_closer_rx => { break }
}; };
// End the loop when connection from aggregator ends: // End the loop when connection from aggregator ends:
let msgs = match msgs { let msgs = match msgs {
Some(msgs) => msgs, Some(msgs) => msgs,
None => break, None => break,
}; };
let total_val = unsafe { total.load(std::sync::atomic::Ordering::Relaxed) }; if feed_id == 1 {
if msgs.len() > total_val { i += 1;
unsafe { total.compare_exchange(total_val, msgs.len(), std::sync::atomic::Ordering::Relaxed, std::sync::atomic::Ordering::Relaxed); }; println!("FEED #{}, msgs: {}", i, msgs.len());
println!("Max msgs: {}", msgs.len()); if i > 1000 {
log::error!("TESTING: close feed");
break
}
} }
// End the loop when there are more than 100k messages queued up.
// This number is just picked as a fairly high limit that should account
// for many thousands of nodes on a chain. The higher this number is, the
// larger our channel storage and memory usage is liable to grow before the feed
// is dropped.
if msgs.len() > 100_000 {
log::warn!("Closing feed websocket that was too slow to keep up (too many messages buffered)");
break 'outer;
}
// There is only one message type at the mo; bytes to send // There is only one message type at the mo; bytes to send
// to the websocket. collect them all up to dispatch in one shot. // to the websocket. collect them all up to dispatch in one shot.
let all_msg_bytes = msgs.into_iter().map(|msg| match msg { let all_msg_bytes = msgs.into_iter().map(|msg| match msg {
ToFeedWebsocket::Bytes(bytes) => bytes, ToFeedWebsocket::Bytes(bytes) => bytes,
}); });
// We have a deadline to send and flush messages. If the client isn't keeping up with our // If the feed is too slow to receive the current batch of messages, we'll drop it.
// messages, the number we obtain from `ReadyChunksAll` will gradually increase and eventually
// we'll hit this deadline and the client will be booted.
let message_send_deadline = Instant::now() + Duration::from_secs(feed_timeout); let message_send_deadline = Instant::now() + Duration::from_secs(feed_timeout);
for bytes in all_msg_bytes { for bytes in all_msg_bytes {
@@ -399,7 +428,7 @@ if msgs.len() > total_val {
.await .await
{ {
Err(_) => { Err(_) => {
log::warn!("Closing feed websocket that was too slow to keep up (1)"); log::warn!("Closing feed websocket that was too slow to keep up (too slow to send messages)");
break 'outer; break 'outer;
} }
Ok(Err(e)) => { Ok(Err(e)) => {
@@ -411,7 +440,7 @@ if msgs.len() > total_val {
} }
match tokio::time::timeout_at(message_send_deadline, ws_send.flush()).await { match tokio::time::timeout_at(message_send_deadline, ws_send.flush()).await {
Err(_) => { Err(_) => {
log::warn!("Closing feed websocket that was too slow to keep up (2)"); log::warn!("Closing feed websocket that was too slow to keep up (too slow to flush messages)");
break; break;
} }
Ok(Err(e)) => { Ok(Err(e)) => {
@@ -420,6 +449,8 @@ if msgs.len() > total_val {
} }
Ok(_) => {} Ok(_) => {}
} }
debounce.await;
} }
drop(recv_closer_tx); // Kill the recv task if this send task ends drop(recv_closer_tx); // Kill the recv task if this send task ends
@@ -434,5 +465,3 @@ if msgs.len() > total_val {
// loop ended; give socket back to parent: // loop ended; give socket back to parent:
(tx_to_aggregator, ws_send) (tx_to_aggregator, ws_send)
} }
static mut total: std::sync::atomic::AtomicUsize = AtomicUsize::new(0);
+5 -4
View File
@@ -22,10 +22,10 @@ able to open a large number of connections and run some of the tests.
Try running these: Try running these:
```sh ```sh
sudo sysctl -w kern.maxfiles=50000 sudo sysctl -w kern.maxfiles=100000
sudo sysctl -w kern.maxfilesperproc=50000 sudo sysctl -w kern.maxfilesperproc=100000
ulimit -n 50000 ulimit -n 100000
sudo sysctl -w kern.ipc.somaxconn=50000 sudo sysctl -w kern.ipc.somaxconn=100000
sudo sysctl -w kern.ipc.maxsockbuf=16777216 sudo sysctl -w kern.ipc.maxsockbuf=16777216
``` ```
*/ */
@@ -580,6 +580,7 @@ async fn slow_feeds_are_disconnected() {
// Timeout faster so the test can be quicker: // Timeout faster so the test can be quicker:
CoreOpts { CoreOpts {
feed_timeout: Some(1), feed_timeout: Some(1),
..Default::default()
}, },
// Allow us to send more messages in more easily: // Allow us to send more messages in more easily:
ShardOpts { ShardOpts {
+1
View File
@@ -14,6 +14,7 @@ hex = "0.4.3"
http = "0.2.4" http = "0.2.4"
hyper = "0.14.11" hyper = "0.14.11"
log = "0.4.14" log = "0.4.14"
num_cpus = "1.13.0"
primitive-types = { version = "0.9.0", features = ["serde"] } primitive-types = { version = "0.9.0", features = ["serde"] }
serde = { version = "1.0.126", features = ["derive"] } serde = { version = "1.0.126", features = ["derive"] }
serde_json = "1.0.64" serde_json = "1.0.64"
+19 -5
View File
@@ -80,10 +80,13 @@ struct Opts {
/// value prevented from reconnecting to this shard for, in seconds. /// value prevented from reconnecting to this shard for, in seconds.
#[structopt(long, default_value = "600")] #[structopt(long, default_value = "600")]
node_block_seconds: u64, node_block_seconds: u64,
/// Number of worker threads to spawn. Defaults to the number of CPUs on the machine.
/// If "0" is given, use the number of CPUs available on the machine.
#[structopt(long)]
num_cpus: Option<usize>,
} }
#[tokio::main] fn main() {
async fn main() {
let opts = Opts::from_args(); let opts = Opts::from_args();
SimpleLogger::new() SimpleLogger::new()
@@ -93,9 +96,20 @@ async fn main() {
log::info!("Starting Telemetry Shard version: {}", VERSION); log::info!("Starting Telemetry Shard version: {}", VERSION);
if let Err(e) = start_server(opts).await { let num_cpus_to_use = opts.num_cpus
log::error!("Error starting server: {}", e); .and_then(|n| if n == 0 { None } else { Some(n) })
} .unwrap_or_else(|| num_cpus::get());
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.worker_threads(num_cpus_to_use)
.build()
.unwrap()
.block_on(async {
if let Err(e) = start_server(opts).await {
log::error!("Error starting server: {}", e);
}
});
} }
/// Declare our routes and start the server. /// Declare our routes and start the server.
@@ -20,11 +20,15 @@ use crate::server::{self, Command, Server};
/// Additional options to pass to the core command.
///
/// A field left as `None` is not forwarded: no corresponding flag is added
/// to the core command line.
pub struct CoreOpts {
/// When `Some`, passed to the core command as `--feed-timeout <value>`.
pub feed_timeout: Option<u64>,
/// When `Some`, passed to the core command as `--num-cpus <value>`.
pub num_cpus: Option<usize>,
}
impl Default for CoreOpts { impl Default for CoreOpts {
fn default() -> Self { fn default() -> Self {
Self { feed_timeout: None } Self {
feed_timeout: None,
num_cpus: None
}
} }
} }
@@ -33,6 +37,7 @@ pub struct ShardOpts {
pub max_nodes_per_connection: Option<usize>, pub max_nodes_per_connection: Option<usize>,
pub max_node_data_per_second: Option<usize>, pub max_node_data_per_second: Option<usize>,
pub node_block_seconds: Option<u64>, pub node_block_seconds: Option<u64>,
pub num_cpus: Option<usize>,
} }
impl Default for ShardOpts { impl Default for ShardOpts {
@@ -41,6 +46,7 @@ impl Default for ShardOpts {
max_nodes_per_connection: None, max_nodes_per_connection: None,
max_node_data_per_second: None, max_node_data_per_second: None,
node_block_seconds: None, node_block_seconds: None,
num_cpus: None
} }
} }
} }
@@ -114,6 +120,11 @@ pub async fn start_server(
.arg("--node-block-seconds") .arg("--node-block-seconds")
.arg(val.to_string()); .arg(val.to_string());
} }
if let Some(val) = shard_opts.num_cpus {
shard_command = shard_command
.arg("--num-cpus")
.arg(val.to_string());
}
// Build the core command // Build the core command
let mut core_command = std::env::var("TELEMETRY_CORE_BIN") let mut core_command = std::env::var("TELEMETRY_CORE_BIN")
@@ -127,6 +138,9 @@ pub async fn start_server(
if let Some(val) = core_opts.feed_timeout { if let Some(val) = core_opts.feed_timeout {
core_command = core_command.arg("--feed-timeout").arg(val.to_string()); core_command = core_command.arg("--feed-timeout").arg(val.to_string());
} }
if let Some(val) = core_opts.num_cpus {
core_command = core_command.arg("--num-cpus").arg(val.to_string());
}
// Start the server
Server::start(server::StartOpts::ShardAndCore { Server::start(server::StartOpts::ShardAndCore {