From 5205eb382efc2f91b82a56c9a635222aaa2792d6 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Fri, 10 Jul 2020 16:15:36 -0700 Subject: [PATCH] Switch to using weighted repair in RepairService (#10735) (#10985) Co-authored-by: Carl --- core/src/cluster_info_vote_listener.rs | 101 +- core/src/repair_service.rs | 356 +-- core/src/repair_weight.rs | 10 +- core/src/retransmit_stage.rs | 6 +- core/src/rpc_pubsub.rs | 2 + core/src/tpu.rs | 10 +- core/src/tvu.rs | 7 +- core/src/validator.rs | 3 + core/src/verified_vote_packets.rs | 4 +- core/src/window_service.rs | 6 +- .../dashboards/cluster-monitor.json | 2323 ++++++++++++++++- 11 files changed, 2472 insertions(+), 356 deletions(-) diff --git a/core/src/cluster_info_vote_listener.rs b/core/src/cluster_info_vote_listener.rs index 0630bb2090..15494c5022 100644 --- a/core/src/cluster_info_vote_listener.rs +++ b/core/src/cluster_info_vote_listener.rs @@ -28,7 +28,7 @@ use solana_sdk::{ pubkey::Pubkey, transaction::Transaction, }; -use solana_vote_program::vote_instruction::VoteInstruction; +use solana_vote_program::{vote_instruction::VoteInstruction, vote_state::Vote}; use std::{ collections::{HashMap, HashSet}, sync::{ @@ -40,16 +40,18 @@ use std::{ }; // Map from a vote account to the authorized voter for an epoch -pub type VerifiedVotePacketsSender = CrossbeamSender>; -pub type VerifiedVotePacketsReceiver = CrossbeamReceiver>; +pub type VerifiedLabelVotePacketsSender = CrossbeamSender>; +pub type VerifiedLabelVotePacketsReceiver = CrossbeamReceiver>; pub type VerifiedVoteTransactionsSender = CrossbeamSender>; pub type VerifiedVoteTransactionsReceiver = CrossbeamReceiver>; +pub type VerifiedVoteSender = CrossbeamSender<(Pubkey, Vote)>; +pub type VerifiedVoteReceiver = CrossbeamReceiver<(Pubkey, Vote)>; #[derive(Default)] pub struct SlotVoteTracker { voted: HashSet>, updates: Option>>, - pub total_stake: u64, + total_stake: u64, } impl SlotVoteTracker { @@ -62,7 +64,7 @@ 
impl SlotVoteTracker { #[derive(Default)] pub struct VoteTracker { // Map from a slot to a set of validators who have voted for that slot - pub slot_vote_trackers: RwLock>>>, + slot_vote_trackers: RwLock>>>, // Don't track votes from people who are not staked, acts as a spam filter epoch_authorized_voters: RwLock>>, leader_schedule_epoch: RwLock, @@ -202,15 +204,17 @@ impl ClusterInfoVoteListener { pub fn new( exit: &Arc, cluster_info: Arc, - sender: CrossbeamSender>, + verified_packets_sender: CrossbeamSender>, poh_recorder: &Arc>, vote_tracker: Arc, bank_forks: Arc>, subscriptions: Arc, + verified_vote_sender: VerifiedVoteSender, ) -> Self { let exit_ = exit.clone(); - let (verified_vote_packets_sender, verified_vote_packets_receiver) = unbounded(); + let (verified_vote_label_packets_sender, verified_vote_label_packets_receiver) = + unbounded(); let (verified_vote_transactions_sender, verified_vote_transactions_receiver) = unbounded(); let listen_thread = Builder::new() .name("solana-cluster_info_vote_listener".to_string()) @@ -218,7 +222,7 @@ impl ClusterInfoVoteListener { let _ = Self::recv_loop( exit_, &cluster_info, - verified_vote_packets_sender, + verified_vote_label_packets_sender, verified_vote_transactions_sender, ); }) @@ -231,9 +235,9 @@ impl ClusterInfoVoteListener { .spawn(move || { let _ = Self::bank_send_loop( exit_, - verified_vote_packets_receiver, + verified_vote_label_packets_receiver, poh_recorder, - &sender, + &verified_packets_sender, ); }) .unwrap(); @@ -248,6 +252,7 @@ impl ClusterInfoVoteListener { vote_tracker, &bank_forks, subscriptions, + verified_vote_sender, ); }) .unwrap(); @@ -267,7 +272,7 @@ impl ClusterInfoVoteListener { fn recv_loop( exit: Arc, cluster_info: &ClusterInfo, - verified_vote_packets_sender: VerifiedVotePacketsSender, + verified_vote_label_packets_sender: VerifiedLabelVotePacketsSender, verified_vote_transactions_sender: VerifiedVoteTransactionsSender, ) -> Result<()> { let mut last_ts = 0; @@ -282,7 +287,7 @@ impl 
ClusterInfoVoteListener { if !votes.is_empty() { let (vote_txs, packets) = Self::verify_votes(votes, labels); verified_vote_transactions_sender.send(vote_txs)?; - verified_vote_packets_sender.send(packets)?; + verified_vote_label_packets_sender.send(packets)?; } sleep(Duration::from_millis(GOSSIP_SLEEP_MILLIS)); @@ -322,9 +327,9 @@ impl ClusterInfoVoteListener { fn bank_send_loop( exit: Arc, - verified_vote_packets_receiver: VerifiedVotePacketsReceiver, + verified_vote_label_packets_receiver: VerifiedLabelVotePacketsReceiver, poh_recorder: Arc>, - packets_sender: &CrossbeamSender>, + verified_packets_sender: &CrossbeamSender>, ) -> Result<()> { let mut verified_vote_packets = VerifiedVotePackets::default(); let mut time_since_lock = Instant::now(); @@ -334,9 +339,10 @@ impl ClusterInfoVoteListener { return Ok(()); } - if let Err(e) = verified_vote_packets - .get_and_process_vote_packets(&verified_vote_packets_receiver, &mut update_version) - { + if let Err(e) = verified_vote_packets.get_and_process_vote_packets( + &verified_vote_label_packets_receiver, + &mut update_version, + ) { match e { Error::CrossbeamRecvTimeoutError(RecvTimeoutError::Disconnected) => { return Ok(()); @@ -353,7 +359,7 @@ impl ClusterInfoVoteListener { if let Some(bank) = bank { let last_version = bank.last_vote_sync.load(Ordering::Relaxed); let (new_version, msgs) = verified_vote_packets.get_latest_votes(last_version); - packets_sender.send(msgs)?; + verified_packets_sender.send(msgs)?; bank.last_vote_sync.compare_and_swap( last_version, new_version, @@ -371,6 +377,7 @@ impl ClusterInfoVoteListener { vote_tracker: Arc, bank_forks: &RwLock, subscriptions: Arc, + verified_vote_sender: VerifiedVoteSender, ) -> Result<()> { loop { if exit.load(Ordering::Relaxed) { @@ -387,6 +394,7 @@ impl ClusterInfoVoteListener { root_bank.slot(), subscriptions.clone(), epoch_stakes, + &verified_vote_sender, ) { match e { Error::CrossbeamRecvTimeoutError(RecvTimeoutError::Disconnected) => { @@ -407,6 +415,7 @@ 
impl ClusterInfoVoteListener { vote_tracker: &Arc, last_root: Slot, subscriptions: Arc, + verified_vote_sender: &VerifiedVoteSender, ) -> Result<()> { Self::get_and_process_votes( vote_txs_receiver, @@ -414,6 +423,7 @@ impl ClusterInfoVoteListener { last_root, subscriptions, None, + verified_vote_sender, ) } @@ -423,6 +433,7 @@ impl ClusterInfoVoteListener { last_root: Slot, subscriptions: Arc, epoch_stakes: Option<&EpochStakes>, + verified_vote_sender: &VerifiedVoteSender, ) -> Result<()> { let timer = Duration::from_millis(200); let mut vote_txs = vote_txs_receiver.recv_timeout(timer)?; @@ -435,6 +446,7 @@ impl ClusterInfoVoteListener { last_root, subscriptions, epoch_stakes, + verified_vote_sender, ); Ok(()) } @@ -445,6 +457,7 @@ impl ClusterInfoVoteListener { root: Slot, subscriptions: Arc, epoch_stakes: Option<&EpochStakes>, + verified_vote_sender: &VerifiedVoteSender, ) { let mut diff: HashMap>> = HashMap::new(); { @@ -516,6 +529,7 @@ impl ClusterInfoVoteListener { } subscriptions.notify_vote(&vote); + let _ = verified_vote_sender.send((*vote_pubkey, vote)); } } } @@ -783,6 +797,7 @@ mod tests { // Create some voters at genesis let (vote_tracker, _, validator_voting_keypairs, subscriptions) = setup(); let (votes_sender, votes_receiver) = unbounded(); + let (verified_vote_sender, verified_vote_receiver) = unbounded(); let vote_slots = vec![1, 2]; validator_voting_keypairs.iter().for_each(|keypairs| { @@ -806,8 +821,20 @@ mod tests { 0, subscriptions, None, + &verified_vote_sender, ) .unwrap(); + + // Check that the received votes were pushed to other components + // subscribing via a channel + let received_votes: Vec<_> = verified_vote_receiver.try_iter().collect(); + assert_eq!(received_votes.len(), validator_voting_keypairs.len()); + for (voting_keypair, (received_pubkey, received_vote)) in + validator_voting_keypairs.iter().zip(received_votes.iter()) + { + assert_eq!(voting_keypair.vote_keypair.pubkey(), *received_pubkey); + assert_eq!(received_vote.slots, 
vote_slots); + } for vote_slot in vote_slots { let slot_vote_tracker = vote_tracker.get_slot_vote_tracker(vote_slot).unwrap(); let r_slot_vote_tracker = slot_vote_tracker.read().unwrap(); @@ -828,14 +855,17 @@ mod tests { // Create some voters at genesis let (vote_tracker, _, validator_voting_keypairs, subscriptions) = setup(); // Send some votes to process - let (votes_sender, votes_receiver) = unbounded(); + let (votes_txs_sender, votes_txs_receiver) = unbounded(); + let (verified_vote_sender, verified_vote_receiver) = unbounded(); + let mut expected_votes = vec![]; for (i, keyset) in validator_voting_keypairs.chunks(2).enumerate() { let validator_votes: Vec<_> = keyset .iter() .map(|keypairs| { let node_keypair = &keypairs.node_keypair; let vote_keypair = &keypairs.vote_keypair; + expected_votes.push((vote_keypair.pubkey(), vec![i as Slot + 1])); vote_transaction::new_vote_transaction( vec![i as u64 + 1], Hash::default(), @@ -846,18 +876,34 @@ mod tests { ) }) .collect(); - votes_sender.send(validator_votes).unwrap(); + votes_txs_sender.send(validator_votes).unwrap(); } - // Check that all the votes were registered for each validator correctly + // Read and process votes from channel `votes_txs_receiver` ClusterInfoVoteListener::get_and_process_votes( - &votes_receiver, + &votes_txs_receiver, &vote_tracker, 0, subscriptions, None, + &verified_vote_sender, ) .unwrap(); + + // Check that the received votes were pushed to other components + // subscribing via a channel + let received_votes: Vec<_> = verified_vote_receiver + .try_iter() + .map(|(pubkey, vote)| (pubkey, vote.slots)) + .collect(); + assert_eq!(received_votes.len(), validator_voting_keypairs.len()); + for (expected_pubkey_vote, received_pubkey_vote) in + expected_votes.iter().zip(received_votes.iter()) + { + assert_eq!(expected_pubkey_vote, received_pubkey_vote); + } + + // Check that all the votes were registered for each validator correctly for (i, keyset) in 
validator_voting_keypairs.chunks(2).enumerate() { let slot_vote_tracker = vote_tracker.get_slot_vote_tracker(i as u64 + 1).unwrap(); let r_slot_vote_tracker = &slot_vote_tracker.read().unwrap(); @@ -974,12 +1020,14 @@ mod tests { &validator0_keypairs.vote_keypair, )]; + let (verified_vote_sender, _verified_vote_receiver) = unbounded(); ClusterInfoVoteListener::process_votes( &vote_tracker, vote_tx, 0, subscriptions.clone(), None, + &verified_vote_sender, ); let ref_count = Arc::strong_count( &vote_tracker @@ -1031,7 +1079,14 @@ mod tests { }) .collect(); - ClusterInfoVoteListener::process_votes(&vote_tracker, vote_txs, 0, subscriptions, None); + ClusterInfoVoteListener::process_votes( + &vote_tracker, + vote_txs, + 0, + subscriptions, + None, + &verified_vote_sender, + ); let ref_count = Arc::strong_count( &vote_tracker diff --git a/core/src/repair_service.rs b/core/src/repair_service.rs index 3538004472..24a418af75 100644 --- a/core/src/repair_service.rs +++ b/core/src/repair_service.rs @@ -2,17 +2,15 @@ //! 
regularly finds missing shreds in the ledger and sends repair requests for those shreds use crate::{ cluster_info::ClusterInfo, - cluster_info_vote_listener::VoteTracker, + cluster_info_vote_listener::VerifiedVoteReceiver, cluster_slots::ClusterSlots, commitment::VOTE_THRESHOLD_SIZE, + repair_weight::RepairWeight, repair_weighted_traversal::Contains, result::Result, serve_repair::{RepairType, ServeRepair, DEFAULT_NONCE}, }; use crossbeam_channel::{Receiver as CrossbeamReceiver, Sender as CrossbeamSender}; -use rand::distributions::{Distribution, WeightedIndex}; -use rand::{thread_rng, Rng, SeedableRng}; -use rand_chacha::ChaChaRng; use solana_ledger::{ bank_forks::BankForks, blockstore::{Blockstore, CompletedSlotsReceiver, SlotMeta}, @@ -79,23 +77,31 @@ pub struct RepairStats { #[derive(Default, Debug)] pub struct RepairTiming { + pub set_root_elapsed: u64, + pub get_votes_elapsed: u64, + pub add_votes_elapsed: u64, pub lowest_slot_elapsed: u64, pub update_completed_slots_elapsed: u64, - pub generate_repairs_elapsed: u64, + pub get_best_orphans_elapsed: u64, + pub get_best_shreds_elapsed: u64, pub send_repairs_elapsed: u64, } impl RepairTiming { fn update( &mut self, + set_root_elapsed: u64, + get_votes_elapsed: u64, + add_votes_elapsed: u64, lowest_slot_elapsed: u64, update_completed_slots_elapsed: u64, - generate_repairs_elapsed: u64, send_repairs_elapsed: u64, ) { + self.set_root_elapsed += set_root_elapsed; + self.get_votes_elapsed += get_votes_elapsed; + self.add_votes_elapsed += add_votes_elapsed; self.lowest_slot_elapsed += lowest_slot_elapsed; self.update_completed_slots_elapsed += update_completed_slots_elapsed; - self.generate_repairs_elapsed += generate_repairs_elapsed; self.send_repairs_elapsed += send_repairs_elapsed; } } @@ -145,7 +151,7 @@ impl RepairService { cluster_info: Arc, repair_info: RepairInfo, cluster_slots: Arc, - vote_tracker: Arc, + verified_vote_receiver: VerifiedVoteReceiver, ) -> Self { let t_repair = Builder::new() 
.name("solana-repair-service".to_string()) @@ -157,7 +163,7 @@ impl RepairService { cluster_info, repair_info, &cluster_slots, - vote_tracker, + verified_vote_receiver, ) }) .unwrap(); @@ -172,15 +178,17 @@ impl RepairService { cluster_info: Arc, repair_info: RepairInfo, cluster_slots: &ClusterSlots, - vote_tracker: Arc, + verified_vote_receiver: VerifiedVoteReceiver, ) { + let mut repair_weight = RepairWeight::new(repair_info.bank_forks.read().unwrap().root()); let serve_repair = ServeRepair::new(cluster_info.clone()); let id = cluster_info.id(); Self::initialize_lowest_slot(id, blockstore, &cluster_info); let mut repair_stats = RepairStats::default(); let mut repair_timing = RepairTiming::default(); let mut last_stats = Instant::now(); - let duplicate_slot_repair_statuses = HashMap::new(); + let duplicate_slot_repair_statuses: HashMap = + HashMap::new(); Self::initialize_epoch_slots( blockstore, @@ -192,12 +200,44 @@ impl RepairService { break; } + let mut set_root_elapsed; + let mut get_votes_elapsed; + let mut add_votes_elapsed; let mut lowest_slot_elapsed; let mut update_completed_slots_elapsed; - let mut generate_repairs_elapsed; let repairs = { let root_bank = repair_info.bank_forks.read().unwrap().root_bank().clone(); let new_root = root_bank.slot(); + + // Purge outdated slots from the weighting heuristic + set_root_elapsed = Measure::start("set_root_elapsed"); + repair_weight.set_root(new_root); + set_root_elapsed.stop(); + + // Add new votes to the weighting heuristic + get_votes_elapsed = Measure::start("get_votes_elapsed"); + let mut slot_to_vote_pubkeys: HashMap> = HashMap::new(); + verified_vote_receiver + .try_iter() + .for_each(|(vote_pubkey, vote)| { + for slot in vote.slots { + slot_to_vote_pubkeys + .entry(slot) + .or_default() + .push(vote_pubkey); + } + }); + get_votes_elapsed.stop(); + + add_votes_elapsed = Measure::start("add_votes"); + repair_weight.add_votes( + &blockstore, + slot_to_vote_pubkeys.into_iter(), + 
root_bank.epoch_stakes_map(), + root_bank.epoch_schedule(), + ); + add_votes_elapsed.stop(); + lowest_slot_elapsed = Measure::start("lowest_slot_elapsed"); let lowest_slot = blockstore.lowest_slot(); Self::update_lowest_slot(&id, lowest_slot, &cluster_info); @@ -230,40 +270,39 @@ impl RepairService { &repair_socket, );*/ - generate_repairs_elapsed = Measure::start("generate_repairs_elapsed"); - let repairs = Self::generate_repairs( + repair_weight.get_best_weighted_repairs( blockstore, - root_bank.slot(), + root_bank.epoch_stakes_map(), + root_bank.epoch_schedule(), + MAX_ORPHANS, MAX_REPAIR_LENGTH, &duplicate_slot_repair_statuses, - &vote_tracker, - ); - generate_repairs_elapsed.stop(); - repairs + Some(&mut repair_timing), + ) }; + let mut cache = HashMap::new(); let mut send_repairs_elapsed = Measure::start("send_repairs_elapsed"); - if let Ok(repairs) = repairs { - let mut cache = HashMap::new(); - repairs.into_iter().for_each(|repair_request| { - if let Ok((to, req)) = serve_repair.repair_request( - &cluster_slots, - repair_request, - &mut cache, - &mut repair_stats, - ) { - repair_socket.send_to(&req, to).unwrap_or_else(|e| { - info!("{} repair req send_to({}) error {:?}", id, to, e); - 0 - }); - } - }); - } + repairs.into_iter().for_each(|repair_request| { + if let Ok((to, req)) = serve_repair.repair_request( + &cluster_slots, + repair_request, + &mut cache, + &mut repair_stats, + ) { + repair_socket.send_to(&req, to).unwrap_or_else(|e| { + info!("{} repair req send_to({}) error {:?}", id, to, e); + 0 + }); + } + }); send_repairs_elapsed.stop(); repair_timing.update( + set_root_elapsed.as_us(), + get_votes_elapsed.as_us(), + add_votes_elapsed.as_us(), lowest_slot_elapsed.as_us(), update_completed_slots_elapsed.as_us(), - generate_repairs_elapsed.as_us(), send_repairs_elapsed.as_us(), ); @@ -285,23 +324,31 @@ impl RepairService { } datapoint_info!( "serve_repair-repair-timing", + ("set-root-elapsed", repair_timing.set_root_elapsed, i64), + 
("get-votes-elapsed", repair_timing.get_votes_elapsed, i64), + ("add-votes-elapsed", repair_timing.add_votes_elapsed, i64), ( - "lowest_slot_elapsed", + "get-best-orphans-elapsed", + repair_timing.get_best_orphans_elapsed, + i64 + ), + ( + "get-best-shreds-elapsed", + repair_timing.get_best_shreds_elapsed, + i64 + ), + ( + "lowest-slot-elapsed", repair_timing.lowest_slot_elapsed, i64 ), ( - "update_completed_slots_elapsed", + "update-completed-slots-elapsed", repair_timing.update_completed_slots_elapsed, i64 ), ( - "generate_repairs_elapsed", - repair_timing.generate_repairs_elapsed, - i64 - ), - ( - "send_repairs_elapsed", + "send-repairs-elapsed", repair_timing.send_repairs_elapsed, i64 ), @@ -402,31 +449,6 @@ impl RepairService { } } - fn generate_repairs( - blockstore: &Blockstore, - root: Slot, - max_repairs: usize, - duplicate_slot_repair_statuses: &HashMap, - vote_tracker: &Arc, - ) -> Result> { - // Slot height and shred indexes for shreds we want to repair - let mut repairs: Vec = vec![]; - Self::generate_repairs_by_level( - blockstore, - &mut repairs, - max_repairs, - root, - duplicate_slot_repair_statuses, - vote_tracker, - ); - - // Try to resolve orphans in blockstore - let orphans = blockstore.orphans_iterator(root + 1).unwrap(); - Self::generate_repairs_for_orphans(orphans, &mut repairs); - - Ok(repairs) - } - #[allow(dead_code)] fn generate_duplicate_repairs_for_slot( blockstore: &Blockstore, @@ -630,81 +652,6 @@ impl RepairService { .collect() } - fn generate_repairs_for_orphans( - orphans: impl Iterator, - repairs: &mut Vec, - ) { - repairs.extend(orphans.take(MAX_ORPHANS).map(RepairType::Orphan)); - } - - /// Repairs any fork starting at the input slot - fn generate_repairs_by_level( - blockstore: &Blockstore, - repairs: &mut Vec, - max_repairs: usize, - slot: Slot, - duplicate_slot_repair_statuses: &HashMap, - vote_tracker: &Arc, - ) { - let mut seed = [0u8; 32]; - thread_rng().fill(&mut seed); - let rng = &mut ChaChaRng::from_seed(seed); - let 
mut pending_slots = vec![slot]; - while repairs.len() < max_repairs && !pending_slots.is_empty() { - pending_slots.retain(|slot| !duplicate_slot_repair_statuses.contains_key(slot)); - let mut next_pending_slots = vec![]; - let mut level_repairs = HashMap::new(); - for slot in &pending_slots { - if let Some(slot_meta) = blockstore.meta(*slot).unwrap() { - let new_repairs = Self::generate_repairs_for_slot( - blockstore, - *slot, - &slot_meta, - std::usize::MAX, - ); - if !new_repairs.is_empty() { - level_repairs.insert(*slot, new_repairs); - } - next_pending_slots.extend(slot_meta.next_slots); - } - } - - if !level_repairs.is_empty() { - let mut slots_to_repair: Vec<_> = level_repairs.keys().cloned().collect(); - let mut weights: Vec<_> = { - let r_vote_tracker = vote_tracker.slot_vote_trackers.read().unwrap(); - slots_to_repair - .iter() - .map(|slot| { - if let Some(slot_vote_tracker) = r_vote_tracker.get(slot) { - std::cmp::max(slot_vote_tracker.read().unwrap().total_stake, 1) - } else { - // should it be something else? 
- 1 - } - }) - .collect() - }; - - let mut weighted_index = WeightedIndex::new(weights.clone()).unwrap(); - while repairs.len() < max_repairs && !level_repairs.is_empty() { - let index = weighted_index.sample(rng); - let slot_repairs = level_repairs.get_mut(&slots_to_repair[index]).unwrap(); - repairs.push(slot_repairs.remove(0)); - if slot_repairs.is_empty() { - level_repairs.remove(&slots_to_repair[index]); - slots_to_repair.remove(index); - weights.remove(index); - if !weights.is_empty() { - weighted_index = WeightedIndex::new(weights.clone()).unwrap(); - } - } - } - } - pending_slots = next_pending_slots; - } - } - fn initialize_lowest_slot(id: Pubkey, blockstore: &Blockstore, cluster_info: &ClusterInfo) { // Safe to set into gossip because by this time, the leader schedule cache should // also be updated with the latest root (done in blockstore_processor) and thus @@ -776,6 +723,7 @@ mod test { use solana_runtime::genesis_utils::{self, GenesisConfigInfo, ValidatorVoteKeypairs}; use solana_sdk::signature::Signer; use solana_vote_program::vote_transaction; + use std::collections::HashSet; #[test] pub fn test_repair_orphan() { @@ -788,11 +736,18 @@ mod test { let (shreds2, _) = make_slot_entries(5, 2, 1); shreds.extend(shreds2); blockstore.insert_shreds(shreds, None, false).unwrap(); - let vote_tracker = Arc::new(VoteTracker::default()); + let mut repair_weight = RepairWeight::new(0); assert_eq!( - RepairService::generate_repairs(&blockstore, 0, 2, &HashMap::new(), &vote_tracker) - .unwrap(), - vec![RepairType::HighestShred(0, 0), RepairType::Orphan(2)] + repair_weight.get_best_weighted_repairs( + &blockstore, + &HashMap::new(), + &EpochSchedule::default(), + MAX_ORPHANS, + MAX_REPAIR_LENGTH, + &HashSet::new(), + None + ), + vec![RepairType::Orphan(2), RepairType::HighestShred(0, 0)] ); } @@ -810,12 +765,19 @@ mod test { // Write this shred to slot 2, should chain to slot 0, which we haven't received // any shreds for blockstore.insert_shreds(shreds, None, 
false).unwrap(); + let mut repair_weight = RepairWeight::new(0); - let vote_tracker = Arc::new(VoteTracker::default()); // Check that repair tries to patch the empty slot assert_eq!( - RepairService::generate_repairs(&blockstore, 0, 2, &HashMap::new(), &vote_tracker) - .unwrap(), + repair_weight.get_best_weighted_repairs( + &blockstore, + &HashMap::new(), + &EpochSchedule::default(), + MAX_ORPHANS, + MAX_REPAIR_LENGTH, + &HashSet::new(), + None + ), vec![RepairType::HighestShred(0, 0)] ); } @@ -860,83 +822,36 @@ mod test { }) .collect(); - let vote_tracker = Arc::new(VoteTracker::default()); + let mut repair_weight = RepairWeight::new(0); assert_eq!( - RepairService::generate_repairs( + repair_weight.get_best_weighted_repairs( &blockstore, - 0, - std::usize::MAX, &HashMap::new(), - &vote_tracker - ) - .unwrap(), + &EpochSchedule::default(), + MAX_ORPHANS, + MAX_REPAIR_LENGTH, + &HashSet::new(), + None + ), expected ); assert_eq!( - RepairService::generate_repairs( + repair_weight.get_best_weighted_repairs( &blockstore, - 0, - expected.len() - 2, &HashMap::new(), - &vote_tracker, - ) - .unwrap()[..], + &EpochSchedule::default(), + MAX_ORPHANS, + expected.len() - 2, + &HashSet::new(), + None + )[..], expected[0..expected.len() - 2] ); } Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction"); } - #[test] - pub fn test_repairs_distributed_across_slots() { - solana_logger::setup(); - let blockstore_path = get_tmp_ledger_path!(); - { - let blockstore = Blockstore::open(&blockstore_path).unwrap(); - - let num_entries_per_slot = 100; - - // Create some shreds - for i in 1..10 { - let (shreds, _) = make_slot_entries(i, 0, num_entries_per_slot as u64); - - // Only insert the first shred - blockstore - .insert_shreds(shreds[..1].to_vec(), None, false) - .unwrap(); - } - - let vote_tracker = Arc::new(VoteTracker::default()); - let repairs = RepairService::generate_repairs( - &blockstore, - 0, - num_entries_per_slot, - &HashMap::new(), - 
&vote_tracker, - ) - .unwrap(); - let mut repairs_slots = HashMap::new(); - for repair in repairs { - match repair { - RepairType::Shred(slot, _shred_index) => { - *repairs_slots.entry(slot).or_insert(0) += 1; - } - RepairType::HighestShred(slot, _shred_index) => { - *repairs_slots.entry(slot).or_insert(0) += 1; - } - RepairType::Orphan(slot) => { - *repairs_slots.entry(slot).or_insert(0) += 1; - } - } - } - for i in 1..10 { - assert!(repairs_slots.contains_key(&i)); - } - } - Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction"); - } - #[test] pub fn test_generate_highest_repair() { let blockstore_path = get_tmp_ledger_path!(); @@ -958,16 +873,17 @@ mod test { let expected: Vec = vec![RepairType::HighestShred(0, num_shreds_per_slot - 1)]; - let vote_tracker = Arc::new(VoteTracker::default()); + let mut repair_weight = RepairWeight::new(0); assert_eq!( - RepairService::generate_repairs( + repair_weight.get_best_weighted_repairs( &blockstore, - 0, - std::usize::MAX, &HashMap::new(), - &vote_tracker - ) - .unwrap(), + &EpochSchedule::default(), + MAX_ORPHANS, + MAX_REPAIR_LENGTH, + &HashSet::new(), + None + ), expected ); } diff --git a/core/src/repair_weight.rs b/core/src/repair_weight.rs index 3c6fa5d4a1..639a5b463c 100644 --- a/core/src/repair_weight.rs +++ b/core/src/repair_weight.rs @@ -1,6 +1,6 @@ use crate::{ heaviest_subtree_fork_choice::HeaviestSubtreeForkChoice, - repair_service::RepairStats, + repair_service::RepairTiming, repair_weighted_traversal::{self, Contains}, serve_repair::RepairType, }; @@ -136,7 +136,7 @@ impl RepairWeight { max_new_orphans: usize, max_new_shreds: usize, ignore_slots: &dyn Contains, - repair_stats: Option<&mut RepairStats>, + repair_timing: Option<&mut RepairTiming>, ) -> Vec { let mut repairs = vec![]; let mut get_best_orphans_elapsed = Measure::start("get_best_orphans"); @@ -155,9 +155,9 @@ impl RepairWeight { self.get_best_shreds(blockstore, &mut repairs, max_new_shreds, ignore_slots); 
get_best_shreds_elapsed.stop(); - if let Some(repair_stats) = repair_stats { - repair_stats.get_best_orphans_us += get_best_orphans_elapsed.as_us(); - repair_stats.get_best_shreds_us += get_best_shreds_elapsed.as_us(); + if let Some(repair_timing) = repair_timing { + repair_timing.get_best_orphans_elapsed += get_best_orphans_elapsed.as_us(); + repair_timing.get_best_shreds_elapsed += get_best_shreds_elapsed.as_us(); } repairs } diff --git a/core/src/retransmit_stage.rs b/core/src/retransmit_stage.rs index fe56c87337..3412eca893 100644 --- a/core/src/retransmit_stage.rs +++ b/core/src/retransmit_stage.rs @@ -1,8 +1,8 @@ //! The `retransmit_stage` retransmits shreds between validators -use crate::cluster_info_vote_listener::VoteTracker; use crate::{ cluster_info::{compute_retransmit_peers, ClusterInfo, DATA_PLANE_FANOUT}, + cluster_info_vote_listener::VerifiedVoteReceiver, cluster_slots::ClusterSlots, contact_info::ContactInfo, repair_service::DuplicateSlotsResetSender, @@ -414,7 +414,7 @@ impl RetransmitStage { shred_version: u16, cluster_slots: Arc, duplicate_slots_reset_sender: DuplicateSlotsResetSender, - vote_tracker: Arc, + verified_vote_receiver: VerifiedVoteReceiver, ) -> Self { let (retransmit_sender, retransmit_receiver) = channel(); @@ -459,7 +459,7 @@ impl RetransmitStage { rv && is_connected }, cluster_slots, - vote_tracker, + verified_vote_receiver, ); let thread_hdls = t_retransmit; diff --git a/core/src/rpc_pubsub.rs b/core/src/rpc_pubsub.rs index c133bd11d9..6809d807a5 100644 --- a/core/src/rpc_pubsub.rs +++ b/core/src/rpc_pubsub.rs @@ -920,11 +920,13 @@ mod tests { }); // Process votes and check they were notified. 
+ let (s, _r) = unbounded(); ClusterInfoVoteListener::get_and_process_votes_for_tests( &votes_receiver, &vote_tracker, 0, rpc.subscriptions.clone(), + &s, ) .unwrap(); diff --git a/core/src/tpu.rs b/core/src/tpu.rs index 6f77824fea..d3df2144d0 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -5,7 +5,7 @@ use crate::{ banking_stage::BankingStage, broadcast_stage::{BroadcastStage, BroadcastStageType, RetransmitSlotsReceiver}, cluster_info::ClusterInfo, - cluster_info_vote_listener::{ClusterInfoVoteListener, VoteTracker}, + cluster_info_vote_listener::{ClusterInfoVoteListener, VerifiedVoteSender, VoteTracker}, fetch_stage::FetchStage, poh_recorder::{PohRecorder, WorkingBankEntry}, rpc_subscriptions::RpcSubscriptions, @@ -52,6 +52,7 @@ impl Tpu { shred_version: u16, vote_tracker: Arc, bank_forks: Arc>, + verified_vote_sender: VerifiedVoteSender, ) -> Self { let (packet_sender, packet_receiver) = channel(); let fetch_stage = FetchStage::new_with_sender( @@ -68,22 +69,23 @@ impl Tpu { SigVerifyStage::new(packet_receiver, verified_sender, verifier) }; - let (verified_vote_sender, verified_vote_receiver) = unbounded(); + let (verified_vote_packets_sender, verified_vote_packets_receiver) = unbounded(); let cluster_info_vote_listener = ClusterInfoVoteListener::new( &exit, cluster_info.clone(), - verified_vote_sender, + verified_vote_packets_sender, &poh_recorder, vote_tracker, bank_forks, subscriptions.clone(), + verified_vote_sender, ); let banking_stage = BankingStage::new( &cluster_info, poh_recorder, verified_receiver, - verified_vote_receiver, + verified_vote_packets_receiver, transaction_status_sender, ); diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 41cb091e44..b43c258bfb 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -6,7 +6,7 @@ use crate::{ accounts_hash_verifier::AccountsHashVerifier, broadcast_stage::RetransmitSlotsSender, cluster_info::ClusterInfo, - cluster_info_vote_listener::VoteTracker, + cluster_info_vote_listener::{VerifiedVoteReceiver, 
VoteTracker}, cluster_slots::ClusterSlots, commitment::BlockCommitmentCache, ledger_cleanup_service::LedgerCleanupService, @@ -96,6 +96,7 @@ impl Tvu { snapshot_package_sender: Option, vote_tracker: Arc, retransmit_slots_sender: RetransmitSlotsSender, + verified_vote_receiver: VerifiedVoteReceiver, tvu_config: TvuConfig, ) -> Self { let keypair: Arc = cluster_info.keypair.clone(); @@ -146,7 +147,7 @@ impl Tvu { tvu_config.shred_version, cluster_slots.clone(), duplicate_slots_reset_sender, - vote_tracker.clone(), + verified_vote_receiver, ); let (ledger_cleanup_slot_sender, ledger_cleanup_slot_receiver) = channel(); @@ -278,6 +279,7 @@ pub mod tests { BlockCommitmentCache::default_with_blockstore(blockstore.clone()), )); let (retransmit_slots_sender, _retransmit_slots_receiver) = unbounded(); + let (_verified_vote_sender, verified_vote_receiver) = unbounded(); let bank_forks = Arc::new(RwLock::new(bank_forks)); let tvu = Tvu::new( &vote_keypair.pubkey(), @@ -310,6 +312,7 @@ pub mod tests { None, Arc::new(VoteTracker::new(&bank)), retransmit_slots_sender, + verified_vote_receiver, TvuConfig::default(), ); exit.store(true, Ordering::Relaxed); diff --git a/core/src/validator.rs b/core/src/validator.rs index 6a321773de..5b535988bd 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -411,6 +411,7 @@ impl Validator { let vote_tracker = Arc::new(VoteTracker::new(bank_forks.read().unwrap().root_bank())); let (retransmit_slots_sender, retransmit_slots_receiver) = unbounded(); + let (verified_vote_sender, verified_vote_receiver) = unbounded(); let tvu = Tvu::new( vote_account, authorized_voter_keypairs, @@ -455,6 +456,7 @@ impl Validator { snapshot_package_sender, vote_tracker.clone(), retransmit_slots_sender, + verified_vote_receiver, TvuConfig { max_ledger_shreds: config.max_ledger_shreds, halt_on_trusted_validators_accounts_hash_mismatch: config @@ -481,6 +483,7 @@ impl Validator { node.info.shred_version, vote_tracker, bank_forks, + verified_vote_sender, ); 
datapoint_info!("validator-new", ("id", id.to_string(), String)); diff --git a/core/src/verified_vote_packets.rs b/core/src/verified_vote_packets.rs index 2cf566e898..1458fb64b2 100644 --- a/core/src/verified_vote_packets.rs +++ b/core/src/verified_vote_packets.rs @@ -1,5 +1,5 @@ use crate::{ - cluster_info_vote_listener::VerifiedVotePacketsReceiver, crds_value::CrdsValueLabel, + cluster_info_vote_listener::VerifiedLabelVotePacketsReceiver, crds_value::CrdsValueLabel, result::Result, }; use solana_perf::packet::Packets; @@ -18,7 +18,7 @@ impl Deref for VerifiedVotePackets { impl VerifiedVotePackets { pub fn get_and_process_vote_packets( &mut self, - vote_packets_receiver: &VerifiedVotePacketsReceiver, + vote_packets_receiver: &VerifiedLabelVotePacketsReceiver, last_update_version: &mut u64, ) -> Result<()> { let timer = Duration::from_millis(200); diff --git a/core/src/window_service.rs b/core/src/window_service.rs index 19bdcbc965..8f5e89090c 100644 --- a/core/src/window_service.rs +++ b/core/src/window_service.rs @@ -3,7 +3,7 @@ //! 
use crate::{ cluster_info::ClusterInfo, - cluster_info_vote_listener::VoteTracker, + cluster_info_vote_listener::VerifiedVoteReceiver, cluster_slots::ClusterSlots, repair_response, repair_service::{RepairInfo, RepairService}, @@ -302,7 +302,7 @@ impl WindowService { leader_schedule_cache: &Arc, shred_filter: F, cluster_slots: Arc, - vote_tracker: Arc, + verified_vote_receiver: VerifiedVoteReceiver, ) -> WindowService where F: 'static @@ -319,7 +319,7 @@ impl WindowService { cluster_info.clone(), repair_info, cluster_slots, - vote_tracker, + verified_vote_receiver, ); let (insert_sender, insert_receiver) = unbounded(); diff --git a/metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json index 12069b53b0..46e915d3ed 100644 --- a/metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json +++ b/metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json @@ -45,6 +45,19 @@ } ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Summary", + "type": "row" + }, { "cacheTimeout": null, "colorBackground": false, @@ -69,7 +82,7 @@ "x": 0, "y": 0 }, - "id": 1, + "id": 2, "interval": null, "links": [], "mappingType": 1, @@ -182,7 +195,7 @@ "x": 12, "y": 0 }, - "id": 2, + "id": 3, "interval": null, "links": [], "mappingType": 1, @@ -283,7 +296,7 @@ "x": 14, "y": 0 }, - "id": 3, + "id": 4, "legend": { "alignAsTable": true, "avg": false, @@ -411,7 +424,7 @@ "x": 0, "y": 2 }, - "id": 4, + "id": 5, "interval": null, "links": [], "mappingType": 1, @@ -523,7 +536,7 @@ "x": 4, "y": 2 }, - "id": 5, + "id": 6, "interval": null, "links": [], "mappingType": 1, @@ -635,7 +648,7 @@ "x": 8, "y": 2 }, - "id": 6, + "id": 7, "interval": null, "links": [], "mappingType": 1, @@ -747,7 +760,7 @@ "x": 11, "y": 2 }, - "id": 7, + "id": 8, "interval": null, "links": [], "mappingType": 1, @@ -859,7 +872,7 @@ "x": 0, 
"y": 4 }, - "id": 8, + "id": 9, "interval": null, "links": [], "mappingType": 1, @@ -970,7 +983,7 @@ "x": 3, "y": 4 }, - "id": 9, + "id": 10, "interval": null, "links": [], "mappingType": 1, @@ -1081,7 +1094,7 @@ "x": 6, "y": 4 }, - "id": 10, + "id": 11, "interval": null, "links": [], "mappingType": 1, @@ -1192,7 +1205,7 @@ "x": 9, "y": 4 }, - "id": 11, + "id": 12, "interval": null, "links": [], "mappingType": 1, @@ -1303,7 +1316,7 @@ "x": 11, "y": 4 }, - "id": 12, + "id": 13, "interval": null, "links": [], "mappingType": 1, @@ -1404,7 +1417,7 @@ "x": 14, "y": 5 }, - "id": 13, + "id": 14, "legend": { "alignAsTable": true, "avg": false, @@ -1560,7 +1573,7 @@ "x": 0, "y": 6 }, - "id": 14, + "id": 15, "legend": { "avg": false, "current": false, @@ -1826,7 +1839,7 @@ "x": 14, "y": 10 }, - "id": 15, + "id": 16, "legend": { "alignAsTable": false, "avg": false, @@ -1938,7 +1951,7 @@ "x": 0, "y": 15 }, - "id": 16, + "id": 17, "panels": [], "title": "Stability", "type": "row" @@ -1956,7 +1969,7 @@ "x": 0, "y": 16 }, - "id": 17, + "id": 18, "legend": { "alignAsTable": false, "avg": false, @@ -2112,7 +2125,7 @@ "x": 8, "y": 16 }, - "id": 18, + "id": 19, "legend": { "avg": false, "current": false, @@ -2348,7 +2361,7 @@ "x": 15, "y": 16 }, - "id": 19, + "id": 20, "interval": null, "links": [], "mappingType": 1, @@ -2459,7 +2472,7 @@ "x": 19, "y": 16 }, - "id": 20, + "id": 21, "interval": null, "links": [], "mappingType": 1, @@ -2570,7 +2583,7 @@ "x": 22, "y": 16 }, - "id": 21, + "id": 22, "interval": null, "links": [], "mappingType": 1, @@ -2670,7 +2683,7 @@ "x": 15, "y": 18 }, - "id": 22, + "id": 23, "legend": { "alignAsTable": true, "avg": false, @@ -2788,7 +2801,7 @@ "x": 20, "y": 18 }, - "id": 23, + "id": 24, "legend": { "alignAsTable": true, "avg": false, @@ -2906,7 +2919,7 @@ "x": 0, "y": 19 }, - "id": 24, + "id": 25, "legend": { "alignAsTable": false, "avg": false, @@ -3059,7 +3072,7 @@ "x": 0, "y": 22 }, - "id": 25, + "id": 26, "links": [], "pageSize": null, "scroll": 
true, @@ -3147,7 +3160,7 @@ "x": 15, "y": 22 }, - "id": 26, + "id": 27, "legend": { "avg": false, "current": false, @@ -3628,7 +3641,7 @@ "x": 0, "y": 28 }, - "id": 27, + "id": 28, "links": [], "pageSize": null, "scroll": true, @@ -3713,7 +3726,7 @@ "x": 8, "y": 28 }, - "id": 28, + "id": 29, "links": [], "pageSize": null, "scroll": true, @@ -3798,7 +3811,7 @@ "x": 16, "y": 28 }, - "id": 29, + "id": 30, "links": [], "pageSize": null, "scroll": true, @@ -3881,7 +3894,7 @@ "x": 0, "y": 34 }, - "id": 30, + "id": 31, "panels": [], "title": "Validator Streamer", "type": "row" @@ -3899,7 +3912,7 @@ "x": 0, "y": 35 }, - "id": 31, + "id": 32, "legend": { "alignAsTable": false, "avg": false, @@ -4138,7 +4151,7 @@ "x": 8, "y": 35 }, - "id": 32, + "id": 33, "legend": { "alignAsTable": false, "avg": false, @@ -4420,7 +4433,7 @@ "x": 16, "y": 35 }, - "id": 33, + "id": 34, "legend": { "alignAsTable": false, "avg": false, @@ -4631,7 +4644,7 @@ "x": 0, "y": 41 }, - "id": 34, + "id": 35, "legend": { "alignAsTable": false, "avg": false, @@ -4998,7 +5011,7 @@ "x": 8, "y": 41 }, - "id": 35, + "id": 36, "legend": { "alignAsTable": false, "avg": false, @@ -5170,7 +5183,7 @@ "x": 16, "y": 41 }, - "id": 36, + "id": 37, "legend": { "alignAsTable": false, "avg": false, @@ -5491,7 +5504,7 @@ "x": 0, "y": 47 }, - "id": 37, + "id": 38, "legend": { "alignAsTable": false, "avg": false, @@ -5921,7 +5934,7 @@ "x": 8, "y": 47 }, - "id": 38, + "id": 39, "legend": { "alignAsTable": false, "avg": false, @@ -6154,7 +6167,7 @@ "x": 16, "y": 47 }, - "id": 39, + "id": 73, "legend": { "alignAsTable": false, "avg": false, @@ -6173,7 +6186,16 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "replay-slot-stats.total_shreds", + "yaxis": 2 + }, + { + "alias": "replay-slot-stats.total_entries", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -6197,7 +6219,7 @@ "measurement": "cluster_info-vote-count", 
"orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"count\") FROM \"$testnet\".\"autogen\".\"bank-forks_set_root_ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"set-root-elapsed\") AS \"set-root-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -6236,7 +6258,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"squash_accounts_ms\") AS \"squash_account\" FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"get-votes-elapsed\") AS \"get-votes-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -6275,7 +6297,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"count\") AS \"serialize_bank\" FROM \"$testnet\".\"autogen\".\"bank-serialize-ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"add-votes-elapsed\") AS \"add-votes-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "C", "resultFormat": "time_series", @@ -6314,7 +6336,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"count\") AS \"add_snapshot_ms\" FROM \"$testnet\".\"autogen\".\"add-snapshot-ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"get-best-orphans-elapsed\") AS 
\"get-best-orphans-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "C", "resultFormat": "time_series", @@ -6353,7 +6375,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"duration\") AS \"serialize_account_storage\" FROM \"$testnet\".\"autogen\".\"serialize_account_storage_ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"get-best-shreds-elapsed\") AS \"get-best-shreds-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "C", "resultFormat": "time_series", @@ -6392,9 +6414,126 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT mean(\"squash_cache_ms\") AS \"squash_cache\" FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT mean(\"update-completed-slots-elapsed\") AS \"update-completed-slots-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, - "refId": "C", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"update-completed-slots-elapsed\") AS \"update-completed-slots-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" 
WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"send-repairs-elapsed\") AS \"send-repairs-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "F", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"lowest-slot-elapsed\") AS \"lowest-slot-elapsed\" FROM \"$testnet\".\"autogen\".\"serve_repair-repair-timing\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "G", "resultFormat": "time_series", "select": [ [ @@ -6416,7 +6555,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Time spent in squashing ($hostid)", + "title": "Repair Timing", "tooltip": { "shared": true, "sort": 0, @@ -6432,7 +6571,7 @@ }, "yaxes": [ { - "format": "ms", + "format": "µs", "label": null, "logBase": 1, "max": null, @@ -6440,7 +6579,7 @@ "show": true }, { - "format": "short", + "format": "µs", "label": null, "logBase": 1, "max": null, @@ -6480,7 +6619,7 @@ "x": 0, "y": 53 }, - "id": 40, + "id": 41, 
"legend": { "alignAsTable": false, "avg": false, @@ -6782,7 +6921,7 @@ "x": 8, "y": 53 }, - "id": 41, + "id": 42, "legend": { "alignAsTable": false, "avg": false, @@ -6923,26 +7062,32 @@ } }, { - "aliasColors": {}, + "aliasColors": { + "cluster-info.repair": "#ba43a9", + "replay_stage-new_leader.last": "#00ffbb", + "tower-observed.squash_account": "#0a437c", + "tower-observed.squash_cache": "#ea6460", + "window-service.receive": "#b7dbab", + "window-stage.consumed": "#5195ce" + }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 16, "y": 53 }, - "id": 42, + "id": 40, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, "total": false, "values": false @@ -6950,9 +7095,9 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "nullPointMode": "connected", "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -6975,10 +7120,50 @@ "type": "fill" } ], + "hide": false, "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT sum(\"recovered\") AS \"recovered\" FROM \"$testnet\".\"autogen\".\"blockstore-erasure\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)", + "query": "SELECT mean(\"count\") FROM \"$testnet\".\"autogen\".\"bank-forks_set_root_ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + 
"orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"squash_accounts_ms\") AS \"squash_account\" FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -6997,12 +7182,168 @@ ] ], "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"count\") AS \"serialize_bank\" FROM \"$testnet\".\"autogen\".\"bank-serialize-ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"count\") AS \"add_snapshot_ms\" FROM \"$testnet\".\"autogen\".\"add-snapshot-ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"duration\") AS \"serialize_account_storage\" FROM 
\"$testnet\".\"autogen\".\"serialize_account_storage_ms\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"squash_cache_ms\") AS \"squash_cache\" FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Erasure Recovery ($hostid)", + "title": "Time spent in squashing ($hostid)", "tooltip": { "shared": true, "sort": 0, @@ -7018,7 +7359,7 @@ }, "yaxes": [ { - "format": "short", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -7052,7 +7393,7 @@ "x": 0, "y": 58 }, - "id": 43, + "id": 44, "legend": { "alignAsTable": false, "avg": false, @@ -7211,7 +7552,7 @@ "x": 8, "y": 58 }, - "id": 44, + "id": 45, "legend": { "alignAsTable": false, "avg": false, @@ -7322,18 +7663,19 @@ "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 6, + "h": 5, "w": 8, "x": 16, - "y": 58 + "y": 60 }, - "id": 45, + "id": 43, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -7369,7 +7711,45 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT count(\"slot\") AS 
\"num_skipped\" FROM \"$testnet\".\"autogen\".\"replay_stage-skip_leader_slot\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time(1s) fill(null)\n", + "query": "SELECT sum(\"recovered\") AS \"recovered\" FROM \"$testnet\".\"autogen\".\"blockstore-erasure\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT sum(\"num_recovered\") AS \"num_recovered\" FROM \"$testnet\".\"autogen\".\"recv-window-insert-shreds\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -7393,7 +7773,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Skipped Leader Slots ($hostid)", + "title": "Erasure Recovery ($hostid)", "tooltip": { "shared": true, "sort": 0, @@ -7409,7 +7789,7 @@ }, "yaxes": [ { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7417,7 +7797,7 @@ "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -8213,12 +8593,130 @@ "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 5, - "w": 12, - "x": 12, + "h": 6, + "w": 8, + "x": 16, + "y": 65 + }, + "id": 46, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT count(\"slot\") AS \"num_skipped\" FROM \"$testnet\".\"autogen\".\"replay_stage-skip_leader_slot\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time(1s) fill(null)\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Skipped Leader Slots ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "poh-service.hashes": "#c15c17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, "y": 71 }, - "id": 52, + "id": 49, "legend": { "alignAsTable": false, "avg": false, @@ -8572,6 +9070,19 @@ "alignLevel": null } }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 50, + "panels": [], + "title": "Tower Consensus", + "type": "row" + }, { "aliasColors": { "cluster-info.repair": "#ba43a9", @@ -8587,11 +9098,11 @@ "fill": 1, "gridPos": { "h": 5, - "w": 12, - "x": 12, - "y": 77 + 
"w": 8, + "x": 0, + "y": 78 }, - "id": 55, + "id": 51, "legend": { "alignAsTable": false, "avg": false, @@ -8755,10 +9266,1179 @@ "gridPos": { "h": 6, "w": 8, + "x": 8, + "y": 78 + }, + "id": 52, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 3, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MEAN(\"slot\") FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)\n\n\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MAX(\"slot\") as \"Max\" FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MIN(\"slot\") as \"Min\" FROM 
\"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Slot", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 78 + }, + "id": 53, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MIN(\"size\") as \"Min\" FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + 
"params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT max(\"slot\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT min(\"slot\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Leader Change", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "slot", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, "x": 0, "y": 83 }, + "id": 54, + "panels": [], + "repeat": null, + "title": "IP Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 55, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"packets_received\") as \"packets_received\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n\n\n\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"receive_errors\") as \"receive_errors\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n\n\n\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + 
"measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"rcvbuf_errors\") as \"rcvbuf_errors\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n\n\n\n", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"packets_sent\") as \"packets_sent\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n\n\n\n", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "UDP Net Stats ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 84 + }, + "id": 56, + "legend": { + "alignAsTable": false, + "avg": false, + "current": 
false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"in_octets\") as \"recv\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE $timeFilter GROUP BY time(1s) fill(null)\n\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"out_octets\") as \"sent\" FROM \"$testnet\".\"autogen\".\"net-stats\" WHERE $timeFilter GROUP BY time(1s) fill(null)\n\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total IP traffic (octets)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, 
+ "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 89 + }, "id": 57, + "panels": [], + "title": "Signature Verification", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 90 + }, + "id": 58, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "sigverify_stage-total_verify_time.num_packets", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"num_packets\") AS \"num_packets\" FROM \"$testnet\".\"autogen\".\"sigverify_stage-total_verify_time\" WHERE $timeFilter GROUP BY time(500ms) FILL(0)\n\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"verify_time_ms\") AS \"verify_time\" FROM 
\"$testnet\".\"autogen\".\"sigverify_stage-total_verify_time\" WHERE $timeFilter GROUP BY time(500ms) FILL(0)\n\n", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"recv_time\") AS \"recv_time\" FROM \"$testnet\".\"autogen\".\"sigverify_stage-total_verify_time\" WHERE $timeFilter GROUP BY time(500ms) FILL(0)\n\n", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "sigverify_stage - Batch Verification Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cluster-info.repair": "#ba43a9", + "replay_stage-new_leader.last": "#00ffbb", + "tower-vote.last": "#00ffbb", + "window-service.receive": "#b7dbab", + "window-stage.consumed": "#5195ce" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 90 + }, + "id": 59, + "legend": { + "alignAsTable": false, + "avg": 
false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT last(\"slot_height\") as \"slot_height\" FROM \"$testnet\".\"autogen\".\"bank-new_from_parent-heights\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"block_height\") as \"block_height\" FROM \"$testnet\".\"autogen\".\"bank-new_from_parent-heights\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Bank Height / Slot Distance ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": 
null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 95 + }, + "id": 60, + "panels": [], + "title": "Snapshots", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 96 + }, + "id": 61, "legend": { "avg": false, "current": false, @@ -8948,9 +10628,9 @@ "h": 6, "w": 8, "x": 8, - "y": 83 + "y": 96 }, - "id": 58, + "id": 62, "legend": { "avg": false, "current": false, @@ -9216,9 +10896,9 @@ "h": 6, "w": 8, "x": 16, - "y": 83 + "y": 96 }, - "id": 59, + "id": 63, "legend": { "avg": false, "current": false, @@ -9399,6 +11079,461 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 102 + }, + "id": 64, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + {} + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MEAN(\"size\") FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)\n\n\n", + "rawQuery": true, + 
"refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MAX(\"size\") FROM \"$testnet\".\"autogen\".\"snapshot-bank-file\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MAX(\"size\") FROM \"$testnet\".\"autogen\".\"snapshot-status-cache-file\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "decbytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 108 + }, + "id": 65, + 
"panels": [], + "title": "RPC Send Transaction Service", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 109 + }, + "id": 66, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 4, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + {} + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MIN(\"duration_ms\") as \"Min\" FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 109 + }, + "id": 67, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 0, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 3, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MAX(\"duration_ms\") FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT MEAN(\"duration_ms\") FROM \"$testnet\".\"autogen\".\"snapshot-package\" WHERE $timeFilter GROUP BY time($__interval)\n\n\n", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -9601,9 +11736,9 @@ "h": 1, "w": 24, "x": 0, - "y": 95 + "y": 115 }, - "id": 60, + "id": 68, "panels": [], "title": "Bench TPS", "type": "row" @@ -9619,9 +11754,9 @@ "h": 5, 
"w": 7, "x": 0, - "y": 96 + "y": 116 }, - "id": 61, + "id": 69, "legend": { "avg": false, "current": false, @@ -9734,9 +11869,9 @@ "h": 5, "w": 7, "x": 7, - "y": 96 + "y": 116 }, - "id": 62, + "id": 70, "legend": { "alignAsTable": false, "avg": false, @@ -9959,9 +12094,9 @@ "h": 5, "w": 10, "x": 14, - "y": 96 + "y": 116 }, - "id": 63, + "id": 71, "links": [], "pageSize": null, "scroll": true, @@ -10047,9 +12182,9 @@ "h": 4, "w": 10, "x": 0, - "y": 101 + "y": 121 }, - "id": 64, + "id": 72, "legend": { "avg": false, "current": false, @@ -10198,7 +12333,7 @@ "selected": false, "text": "Testnet", "value": "tds" - } + } ], "query": "devnet,mainnet-beta,tds", "type": "custom" @@ -10252,4 +12387,4 @@ "title": "Cluster Telemetry", "uid": "monitor", "version": 1 -} +} \ No newline at end of file