Add 'unknown' health check state

This commit is contained in:
Michael Vines
2021-03-04 21:18:08 -08:00
committed by mergify[bot]
parent ee621878b0
commit 66b781eec3
5 changed files with 74 additions and 58 deletions

View File

@ -1866,6 +1866,10 @@ pub mod rpc_minimal {
fn get_health(&self, meta: Self::Metadata) -> Result<String> { fn get_health(&self, meta: Self::Metadata) -> Result<String> {
match meta.health.check() { match meta.health.check() {
RpcHealthStatus::Ok => Ok("ok".to_string()), RpcHealthStatus::Ok => Ok("ok".to_string()),
RpcHealthStatus::Unknown => Err(RpcCustomError::NodeUnhealthy {
num_slots_behind: None,
}
.into()),
RpcHealthStatus::Behind { num_slots } => Err(RpcCustomError::NodeUnhealthy { RpcHealthStatus::Behind { num_slots } => Err(RpcCustomError::NodeUnhealthy {
num_slots_behind: Some(num_slots), num_slots_behind: Some(num_slots),
} }
@ -2700,6 +2704,12 @@ pub mod rpc_full {
match meta.health.check() { match meta.health.check() {
RpcHealthStatus::Ok => (), RpcHealthStatus::Ok => (),
RpcHealthStatus::Unknown => {
return Err(RpcCustomError::NodeUnhealthy {
num_slots_behind: None,
}
.into());
}
RpcHealthStatus::Behind { num_slots } => { RpcHealthStatus::Behind { num_slots } => {
return Err(RpcCustomError::NodeUnhealthy { return Err(RpcCustomError::NodeUnhealthy {
num_slots_behind: Some(num_slots), num_slots_behind: Some(num_slots),

View File

@ -8,10 +8,11 @@ use {
}, },
}; };
#[derive(PartialEq, Clone, Copy)] #[derive(PartialEq, Clone, Copy, Debug)]
pub enum RpcHealthStatus { pub enum RpcHealthStatus {
Ok, Ok,
Behind { num_slots: Slot }, // Validator is behind its trusted validators Behind { num_slots: Slot }, // Validator is behind its trusted validators
Unknown,
} }
pub struct RpcHealth { pub struct RpcHealth {
@ -51,8 +52,7 @@ impl RpcHealth {
if self.override_health_check.load(Ordering::Relaxed) { if self.override_health_check.load(Ordering::Relaxed) {
RpcHealthStatus::Ok RpcHealthStatus::Ok
} else if let Some(trusted_validators) = &self.trusted_validators { } else if let Some(trusted_validators) = &self.trusted_validators {
let (latest_account_hash_slot, latest_trusted_validator_account_hash_slot) = { match (
(
self.cluster_info self.cluster_info
.get_accounts_hash_for_node(&self.cluster_info.id(), |hashes| { .get_accounts_hash_for_node(&self.cluster_info.id(), |hashes| {
hashes hashes
@ -60,11 +60,10 @@ impl RpcHealth {
.max_by(|a, b| a.0.cmp(&b.0)) .max_by(|a, b| a.0.cmp(&b.0))
.map(|slot_hash| slot_hash.0) .map(|slot_hash| slot_hash.0)
}) })
.flatten() .flatten(),
.unwrap_or(0),
trusted_validators trusted_validators
.iter() .iter()
.map(|trusted_validator| { .filter_map(|trusted_validator| {
self.cluster_info self.cluster_info
.get_accounts_hash_for_node(&trusted_validator, |hashes| { .get_accounts_hash_for_node(&trusted_validator, |hashes| {
hashes hashes
@ -73,18 +72,16 @@ impl RpcHealth {
.map(|slot_hash| slot_hash.0) .map(|slot_hash| slot_hash.0)
}) })
.flatten() .flatten()
.unwrap_or(0)
}) })
.max() .max(),
.unwrap_or(0), ) {
) (
}; Some(latest_account_hash_slot),
Some(latest_trusted_validator_account_hash_slot),
// This validator is considered healthy if its latest account hash slot is within ) => {
// The validator is considered healthy if its latest account hash slot is within
// `health_check_slot_distance` of the latest trusted validator's account hash slot // `health_check_slot_distance` of the latest trusted validator's account hash slot
if latest_account_hash_slot > 0 if latest_account_hash_slot
&& latest_trusted_validator_account_hash_slot > 0
&& latest_account_hash_slot
> latest_trusted_validator_account_hash_slot > latest_trusted_validator_account_hash_slot
.saturating_sub(self.health_check_slot_distance) .saturating_sub(self.health_check_slot_distance)
{ {
@ -94,10 +91,15 @@ impl RpcHealth {
.saturating_sub(latest_account_hash_slot); .saturating_sub(latest_account_hash_slot);
warn!( warn!(
"health check: behind by {} slots: me={}, latest trusted_validator={}", "health check: behind by {} slots: me={}, latest trusted_validator={}",
num_slots, latest_account_hash_slot, latest_trusted_validator_account_hash_slot num_slots,
latest_account_hash_slot,
latest_trusted_validator_account_hash_slot
); );
RpcHealthStatus::Behind { num_slots } RpcHealthStatus::Behind { num_slots }
} }
}
_ => RpcHealthStatus::Unknown,
}
} else { } else {
// No trusted validator point of reference available, so this validator is healthy // No trusted validator point of reference available, so this validator is healthy
// because it's running // because it's running

View File

@ -178,7 +178,8 @@ impl RpcRequestMiddleware {
fn health_check(&self) -> &'static str { fn health_check(&self) -> &'static str {
let response = match self.health.check() { let response = match self.health.check() {
RpcHealthStatus::Ok => "ok", RpcHealthStatus::Ok => "ok",
RpcHealthStatus::Behind { num_slots: _ } => "behind", RpcHealthStatus::Behind { .. } => "behind",
RpcHealthStatus::Unknown => "unknown",
}; };
info!("health check: {}", response); info!("health check: {}", response);
response response
@ -696,18 +697,20 @@ mod tests {
let rm = RpcRequestMiddleware::new(PathBuf::from("/"), None, create_bank_forks(), health); let rm = RpcRequestMiddleware::new(PathBuf::from("/"), None, create_bank_forks(), health);
// No account hashes for this node or any trusted validators == "behind" // No account hashes for this node or any trusted validators
assert_eq!(rm.health_check(), "behind"); assert_eq!(rm.health_check(), "unknown");
// No account hashes for any trusted validators == "behind" // No account hashes for any trusted validators
cluster_info.push_accounts_hashes(vec![(1000, Hash::default()), (900, Hash::default())]); cluster_info.push_accounts_hashes(vec![(1000, Hash::default()), (900, Hash::default())]);
cluster_info.flush_push_queue(); cluster_info.flush_push_queue();
assert_eq!(rm.health_check(), "behind"); assert_eq!(rm.health_check(), "unknown");
// Override health check
override_health_check.store(true, Ordering::Relaxed); override_health_check.store(true, Ordering::Relaxed);
assert_eq!(rm.health_check(), "ok"); assert_eq!(rm.health_check(), "ok");
override_health_check.store(false, Ordering::Relaxed); override_health_check.store(false, Ordering::Relaxed);
// This node is ahead of the trusted validators == "ok" // This node is ahead of the trusted validators
cluster_info cluster_info
.gossip .gossip
.write() .write()
@ -727,7 +730,7 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(rm.health_check(), "ok"); assert_eq!(rm.health_check(), "ok");
// Node is slightly behind the trusted validators == "ok" // Node is slightly behind the trusted validators
cluster_info cluster_info
.gossip .gossip
.write() .write()
@ -743,7 +746,7 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(rm.health_check(), "ok"); assert_eq!(rm.health_check(), "ok");
// Node is far behind the trusted validators == "behind" // Node is far behind the trusted validators
cluster_info cluster_info
.gossip .gossip
.write() .write()

View File

@ -187,11 +187,12 @@ Many methods that take a commitment parameter return an RpcResponse JSON object
Although not a JSON RPC API, a `GET /health` at the RPC HTTP Endpoint provides a Although not a JSON RPC API, a `GET /health` at the RPC HTTP Endpoint provides a
health-check mechanism for use by load balancers or other network health-check mechanism for use by load balancers or other network
infrastructure. This request will always return a HTTP 200 OK response with a body of infrastructure. This request will always return a HTTP 200 OK response with a body of
"ok" or "behind" based on the following conditions: "ok", "behind" or "unknown" based on the following conditions:
1. If one or more `--trusted-validator` arguments are provided to `solana-validator`, "ok" is returned 1. If one or more `--trusted-validator` arguments are provided to `solana-validator`, "ok" is returned
when the node is within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest trusted validator, when the node is within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest
otherwise "behind" is returned. trusted validator, otherwise "behind". "unknown" is returned when no slot
information from trusted validators is yet available.
2. "ok" is always returned if no trusted validators are provided. 2. "ok" is always returned if no trusted validators are provided.
## JSON RPC API Reference ## JSON RPC API Reference

View File

@ -266,7 +266,7 @@ fn get_validator_stats(
{ {
format!("{} slots behind", num_slots_behind) format!("{} slots behind", num_slots_behind)
} else { } else {
"unhealthy".to_string() "health unknown".to_string()
} }
} }
}; };