diff --git a/Cargo.lock b/Cargo.lock
index 905e166ca5..062da68db2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5833,6 +5833,7 @@ dependencies = [
  "solana-version",
  "solana-vote-program",
  "symlink",
+ "sysctl",
  "tikv-jemallocator",
 ]
 
diff --git a/multinode-demo/bootstrap-validator.sh b/multinode-demo/bootstrap-validator.sh
index 13f57311f8..6625bc1f5b 100755
--- a/multinode-demo/bootstrap-validator.sh
+++ b/multinode-demo/bootstrap-validator.sh
@@ -124,6 +124,7 @@ args+=(
   --vote-account "$vote_account"
   --rpc-faucet-address 127.0.0.1:9900
   --no-poh-speed-test
+  --no-os-network-limits-test
   --no-wait-for-vote-to-start-leader
 )
 default_arg --gossip-port 8001
diff --git a/multinode-demo/validator.sh b/multinode-demo/validator.sh
index 239e7c8c23..cd76b5d3d4 100755
--- a/multinode-demo/validator.sh
+++ b/multinode-demo/validator.sh
@@ -9,6 +9,7 @@ source "$here"/common.sh
 args=(
   --max-genesis-archive-unpacked-size 1073741824
   --no-poh-speed-test
+  --no-os-network-limits-test
 )
 airdrops_enabled=1
 node_sol=500 # 500 SOL: number of SOL to airdrop the node for transaction fees and vote account rent exemption (ignored if airdrops_enabled=0)
diff --git a/validator/Cargo.toml b/validator/Cargo.toml
index 268eee4da1..5b58913cbe 100644
--- a/validator/Cargo.toml
+++ b/validator/Cargo.toml
@@ -56,6 +56,7 @@ jemallocator = {package = "tikv-jemallocator", version = "0.4.1", features = ["u
 [target."cfg(unix)".dependencies]
 libc = "0.2.104"
 signal-hook = "0.2.3"
+sysctl = "0.4.2"
 
 [package.metadata.docs.rs]
 targets = ["x86_64-unknown-linux-gnu"]
diff --git a/validator/src/main.rs b/validator/src/main.rs
index ad63017ad7..c63a4a0db9 100644
--- a/validator/src/main.rs
+++ b/validator/src/main.rs
@@ -30,6 +30,7 @@ use {
         contact_info::ContactInfo,
     },
     solana_ledger::blockstore_db::BlockstoreRecoveryMode,
+    solana_metrics::datapoint_info,
     solana_perf::recycler::enable_recycler_warming,
     solana_poh::poh_service,
     solana_replica_lib::accountsdb_repl_server::AccountsDbReplServiceConfig,
@@ -411,6 +412,99 @@ fn get_cluster_shred_version(entrypoints: &[SocketAddr]) -> Option {
     None
 }
 
+/// Returns a `family/os/arch` identifier for the build target (e.g. `unix/linux/x86_64`).
+fn platform_id() -> String {
+    format!(
+        "{}/{}/{}",
+        std::env::consts::FAMILY,
+        std::env::consts::OS,
+        std::env::consts::ARCH
+    )
+}
+
+/// Checks kernel sysctl limits against recommended values and reports each one as an
+/// `os-config` datapoint.
+///
+/// A limit that cannot be read or parsed is logged and fails the check; it never aborts
+/// validator startup. The outcome is only logged and reported, nothing is returned.
+#[cfg(target_os = "linux")]
+fn check_os_network_limits() {
+    use solana_metrics::datapoint_warn;
+    use sysctl::Sysctl;
+
+    // Reference: https://medium.com/@CameronSparr/increase-os-udp-buffers-to-improve-performance-51d167bb1360
+    // Limits are checked in declaration order so the log output is stable.
+    const RECOMMENDED_LIMITS: [(&str, i64); 7] = [
+        ("net.core.rmem_max", 134217728),
+        ("net.core.rmem_default", 134217728),
+        ("net.core.wmem_max", 134217728),
+        ("net.core.wmem_default", 134217728),
+        ("vm.max_map_count", 1000000),
+        // Additionally collect the following limits
+        ("net.core.optmem_max", 0),
+        ("net.core.netdev_max_backlog", 0),
+    ];
+
+    /// Reads the sysctl `name` and returns its value as a string.
+    fn sysctl_read(name: &str) -> Result<String, sysctl::SysctlError> {
+        let ctl = sysctl::Ctl::new(name)?;
+        let val = ctl.value_string()?;
+        Ok(val)
+    }
+
+    let mut check_failed = false;
+
+    info!("Testing OS network limits:");
+
+    for &(key, recommended_val) in RECOMMENDED_LIMITS.iter() {
+        let current_val = match sysctl_read(key) {
+            Ok(val) => match val.trim().parse::<i64>() {
+                Ok(parsed) => parsed,
+                Err(e) => {
+                    // Don't panic on a non-integer value; treat it as a failed check.
+                    error!("Failed to parse value {:?} for {}: {}", val, key, e);
+                    check_failed = true;
+                    continue;
+                }
+            },
+            Err(e) => {
+                error!("Failed to query value for {}: {}", key, e);
+                check_failed = true;
+                continue;
+            }
+        };
+
+        if current_val < recommended_val {
+            datapoint_warn!("os-config", (key, current_val, i64));
+            warn!(
+                " {}: recommended={} current={}, too small",
+                key, recommended_val, current_val
+            );
+            check_failed = true;
+        } else {
+            datapoint_info!("os-config", (key, current_val, i64));
+            info!(
+                " {}: recommended={} current={}",
+                key, recommended_val, current_val
+            );
+        }
+    }
+    datapoint_info!("os-config", ("platform", platform_id(), String));
+
+    if check_failed {
+        datapoint_warn!("os-config", ("network_limit_test_failed", 1, i64));
+        warn!("OS network limit test failed. solana-sys-tuner may be used to configure OS network limits. Bypass check with --no-os-network-limits-test.");
+    } else {
+        info!("OS network limits test passed.");
+    }
+}
+
+/// On non-Linux platforms only the platform identifier is reported.
+#[cfg(not(target_os = "linux"))]
+fn check_os_network_limits() {
+    datapoint_info!("os-config", ("platform", platform_id(), String));
+}
+
 pub fn main() {
     let default_dynamic_port_range =
         &format!("{}-{}", VALIDATOR_PORT_RANGE.0, VALIDATOR_PORT_RANGE.1);
@@ -867,6 +961,12 @@ pub fn main() {
                 .long("no-poh-speed-test")
                 .help("Skip the check for PoH speed."),
         )
+        .arg(
+            Arg::with_name("no_os_network_limits_test")
+                .hidden(true)
+                .long("no-os-network-limits-test")
+                .help("Skip checks for OS network limits.")
+        )
         .arg(
             Arg::with_name("accounts-hash-interval-slots")
                 .long("accounts-hash-interval-slots")
@@ -2345,6 +2445,10 @@ pub fn main() {
         })
     });
 
+    if !matches.is_present("no_os_network_limits_test") {
+        check_os_network_limits();
+    }
+
     let mut ledger_lock = ledger_lockfile(&ledger_path);
     let _ledger_write_guard = lock_ledger(&ledger_path, &mut ledger_lock);
 