diff --git a/watchtower/README.md b/watchtower/README.md index 1779eb2528..fb19250769 100644 --- a/watchtower/README.md +++ b/watchtower/README.md @@ -3,6 +3,10 @@ periodically polls the cluster over an RPC API to confirm that the transaction count is advancing, new blockhashes are available, and no validators are delinquent. Results are reported as InfluxDB metrics. +If you only care about the health of one specific validator, the +`--validator-identity` command-line argument can be used to restrict failure +notifications to issues affecting only that validator. + ### Metrics #### `watchtower-sanity` On every iteration this data point will be emitted indicating the overall result diff --git a/watchtower/src/main.rs b/watchtower/src/main.rs index 81aca44669..9bd91c4216 100644 --- a/watchtower/src/main.rs +++ b/watchtower/src/main.rs @@ -2,7 +2,10 @@ use clap::{crate_description, crate_name, value_t_or_exit, App, Arg}; use log::*; -use solana_clap_utils::input_validators::is_url; +use solana_clap_utils::{ + input_parsers::pubkey_of, + input_validators::{is_pubkey_or_keypair, is_url}, +}; use solana_client::rpc_client::RpcClient; use solana_metrics::{datapoint_error, datapoint_info}; use std::{error, io, thread::sleep, time::Duration}; @@ -28,10 +31,19 @@ fn main() -> Result<(), Box> { .default_value("60") .help("Wait interval seconds between checking the cluster"), ) + .arg( + Arg::with_name("validator_identity") + .long("validator-identity") + .value_name("VALIDATOR IDENTITY PUBKEY") + .takes_value(true) + .validator(is_pubkey_or_keypair) + .help("Monitor a specific validator only instead of the entire cluster"), + ) .get_matches(); let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64)); let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String); + let validator_identity = pubkey_of(&matches, "validator_identity").map(|i| i.to_string()); solana_logger::setup_with_filter("solana=info"); solana_metrics::set_panic_hook("watchtower"); 
@@ -92,13 +104,44 @@ fn main() -> Result<(), Box> { "Delinquent validator count: {}", vote_accounts.delinquent.len() ); - if vote_accounts.delinquent.is_empty() { - Ok(true) - } else { - Err(io::Error::new( - io::ErrorKind::Other, - format!("{} delinquent validators", vote_accounts.delinquent.len()), - )) + + match validator_identity.as_ref() { + Some(validator_identity) => { + if vote_accounts + .current + .iter() + .any(|vai| vai.node_pubkey == *validator_identity) + { + Ok(true) + } else if vote_accounts + .delinquent + .iter() + .any(|vai| vai.node_pubkey == *validator_identity) + { + Err(io::Error::new( + io::ErrorKind::Other, + format!("Validator {} is delinquent", validator_identity), + )) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + format!("Validator {} is missing", validator_identity), + )) + } + } + None => { + if vote_accounts.delinquent.is_empty() { + Ok(true) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + format!( + "{} delinquent validators", + vote_accounts.delinquent.len() + ), + )) + } + } } }) .unwrap_or_else(|err| {