From ccb4e32ee0da1e4af26884653fff0f611e8e097b Mon Sep 17 00:00:00 2001 From: carllin Date: Wed, 16 Oct 2019 14:32:18 -0700 Subject: [PATCH] ReplayStage metrics (#6358) * ReplayStage metrics * Add more metrics * Refactor get_slot_entries_with_shred_count() to detect wasted work * Update dashboard * Update broadcast slots to micros * Add broadcast dashboard --- core/src/blocktree.rs | 107 +- core/src/broadcast_stage.rs | 2 +- .../broadcast_stage/standard_broadcast_run.rs | 2 +- core/src/replay_stage.rs | 145 ++- .../dashboards/testnet-monitor.json | 1082 ++++++++++++++--- 5 files changed, 1053 insertions(+), 285 deletions(-) diff --git a/core/src/blocktree.rs b/core/src/blocktree.rs index 541841ccbb..523d7f2bc6 100644 --- a/core/src/blocktree.rs +++ b/core/src/blocktree.rs @@ -25,6 +25,7 @@ use std::path::{Path, PathBuf}; use std::rc::Rc; use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TrySendError}; use std::sync::{Arc, RwLock}; +use std::time::Instant; pub use self::meta::*; pub use self::rooted_slot_iterator::*; @@ -981,65 +982,73 @@ impl Blocktree { &self, slot: u64, mut start_index: u64, - ) -> Result<(Vec, usize)> { - // Find the next consecutive block of shreds. - let mut serialized_shreds: Vec> = vec![]; - let data_shred_cf = self.db.column::(); - - while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, start_index))? { - serialized_shreds.push(serialized_shred); - start_index += 1; - } - - trace!( - "Found {:?} shreds for slot {:?}", - serialized_shreds.len(), - slot - ); - - let mut shreds: Vec = serialized_shreds - .into_iter() - .filter_map(|serialized_shred| Shred::new_from_serialized_shred(serialized_shred).ok()) - .collect(); + ) -> Result<(Vec, usize, u64, u64)> { + let mut useful_time = 0; + let mut wasted_time = 0; let mut all_entries = vec![]; - let mut num = 0; + let mut num_shreds = 0; loop { - let mut look_for_last_shred = true; + let now = Instant::now(); + let mut res = self.get_entries_in_data_block(slot, &mut start_index); + let elapsed = now.elapsed().as_micros(); - let mut shred_chunk = vec![]; - while look_for_last_shred && !shreds.is_empty() { - let shred = shreds.remove(0); - if shred.data_complete() || shred.last_in_slot() { - look_for_last_shred = false; + if let Ok((ref mut entries, new_num_shreds)) = res { + if !entries.is_empty() { + all_entries.append(entries); + num_shreds += new_num_shreds; + useful_time += elapsed; + continue; } - shred_chunk.push(shred); } - debug!( - "{:?} shreds in last FEC set. Looking for last shred {:?}", - shred_chunk.len(), - look_for_last_shred - ); - - // Break if we didn't find the last shred (as more data is required) - if look_for_last_shred { - break; - } - - if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) { - let entries: Vec = bincode::deserialize(&deshred_payload)?; - trace!("Found entries: {:#?}", entries); - all_entries.extend(entries); - num += shred_chunk.len(); - } else { - debug!("Failed in deshredding shred payloads"); - break; - } + // All unsuccessful cases (errors, incomplete data blocks) will count as wasted work + wasted_time += elapsed; + res?; + break; } trace!("Found {:?} entries", all_entries.len()); - Ok((all_entries, num)) + Ok(( + all_entries, + num_shreds, + useful_time as u64, + wasted_time as u64, + )) + } + + pub fn get_entries_in_data_block( + &self, + slot: u64, + start_index: &mut u64, + ) -> Result<(Vec, usize)> { + let mut shred_chunk: Vec = vec![]; + let data_shred_cf = self.db.column::(); + while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, *start_index))? { + *start_index += 1; + let new_shred = Shred::new_from_serialized_shred(serialized_shred).ok(); + if let Some(shred) = new_shred { + let is_complete = shred.data_complete() || shred.last_in_slot(); + shred_chunk.push(shred); + if is_complete { + if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) { + debug!("{:?} shreds in last FEC set", shred_chunk.len(),); + let entries: Vec = bincode::deserialize(&deshred_payload)?; + return Ok((entries, shred_chunk.len())); + } else { + debug!("Failed in deshredding shred payloads"); + break; + } + } + } else { + // Didn't find a valid shred, this slot is dead. + // TODO: Mark as dead, but have to carefully handle last shred of interrupted + // slots. + break; + } + } + + Ok((vec![], 0)) } // Returns slots connecting to any element of the list `slots`. diff --git a/core/src/broadcast_stage.rs b/core/src/broadcast_stage.rs index c3a3d5721c..94fc9e8044 100644 --- a/core/src/broadcast_stage.rs +++ b/core/src/broadcast_stage.rs @@ -298,7 +298,7 @@ mod test { ); let blocktree = broadcast_service.blocktree; - let (entries, _) = blocktree + let (entries, _, _, _) = blocktree .get_slot_entries_with_shred_count(slot, 0) .expect("Expect entries to be present"); assert_eq!(entries.len(), max_tick_height as usize); diff --git a/core/src/broadcast_stage/standard_broadcast_run.rs b/core/src/broadcast_stage/standard_broadcast_run.rs index 1a7cfa847c..63b7b01b43 100644 --- a/core/src/broadcast_stage/standard_broadcast_run.rs +++ b/core/src/broadcast_stage/standard_broadcast_run.rs @@ -270,7 +270,7 @@ impl StandardBroadcastRun { ), ( "slot_broadcast_time", - self.slot_broadcast_start.unwrap().elapsed().as_millis() as i64, + self.slot_broadcast_start.unwrap().elapsed().as_micros() as i64, i64 ), ); diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 65b855b83c..9dd8e5cdec 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -59,20 +59,73 @@ pub struct ReplayStage { confidence_service: AggregateConfidenceService, } -#[derive(Default)] +struct ReplaySlotStats { + // Per-slot elapsed time + slot: u64, + fetch_entries_elapsed: u64, + fetch_entries_fail_elapsed: u64, + entry_verification_elapsed: u64, + replay_elapsed: u64, + replay_start: Instant, +} + +impl ReplaySlotStats { + pub fn new(slot: u64) -> Self { + Self { + slot, + fetch_entries_elapsed: 0, + fetch_entries_fail_elapsed: 0, + entry_verification_elapsed: 0, + replay_elapsed: 0, + replay_start: Instant::now(), + } + } + + pub fn report_stats(&self, total_entries: usize, total_shreds: usize) { + datapoint_info!( + "replay-slot-stats", + ("slot", self.slot as i64, i64), + ("fetch_entries_time", self.fetch_entries_elapsed as i64, i64), + ( + "fetch_entries_fail_time", + self.fetch_entries_fail_elapsed as i64, + i64 + ), + ( + "entry_verification_time", + self.entry_verification_elapsed as i64, + i64 + ), + ("replay_time", self.replay_elapsed as i64, i64), + ( + "replay_total_elapsed", + self.replay_start.elapsed().as_micros() as i64, + i64 + ), + ("total_entries", total_entries as i64, i64), + ("total_shreds", total_shreds as i64, i64), + ); + } +} + struct ForkProgress { last_entry: Hash, - num_blobs: usize, + num_shreds: usize, + num_entries: usize, started_ms: u64, is_dead: bool, + stats: ReplaySlotStats, } + impl ForkProgress { - pub fn new(last_entry: Hash) -> Self { + pub fn new(slot: u64, last_entry: Hash) -> Self { Self { last_entry, - num_blobs: 0, + num_shreds: 0, + num_entries: 0, started_ms: timing::timestamp(), is_dead: false, + stats: ReplaySlotStats::new(slot), } } } @@ -369,24 +422,41 @@ impl ReplayStage { progress: &mut HashMap, ) -> (Result<()>, usize) { let mut tx_count = 0; - let result = - Self::load_blocktree_entries(bank, blocktree, progress).and_then(|(entries, num)| { - debug!("Replaying {:?} entries, num {:?}", entries.len(), num); + let bank_progress = &mut progress + .entry(bank.slot()) + .or_insert_with(|| ForkProgress::new(bank.slot(), bank.last_blockhash())); + let now = Instant::now(); + let load_result = Self::load_blocktree_entries(bank, blocktree, bank_progress); + let fetch_entries_elapsed = now.elapsed().as_micros(); + + if load_result.is_err() { + bank_progress.stats.fetch_entries_fail_elapsed += fetch_entries_elapsed as u64; + } + let replay_result = + load_result.and_then(|(entries, num_shreds, useful_time, wasted_time)| { + trace!( + "Fetch entries for slot {}, {:?} entries, num shreds {:?}", + bank.slot(), + entries.len(), + num_shreds + ); tx_count += entries.iter().map(|e| e.transactions.len()).sum::(); - Self::replay_entries_into_bank(bank, entries, progress, num) + bank_progress.stats.fetch_entries_elapsed += useful_time as u64; + bank_progress.stats.fetch_entries_fail_elapsed += wasted_time as u64; + Self::replay_entries_into_bank(bank, entries, bank_progress, num_shreds) }); - if Self::is_replay_result_fatal(&result) { + if Self::is_replay_result_fatal(&replay_result) { warn!( "Fatal replay result in slot: {}, result: {:?}", bank.slot(), - result + replay_result ); datapoint_warn!("replay-stage-mark_dead_slot", ("slot", bank.slot(), i64),); Self::mark_dead_slot(bank.slot(), blocktree, progress); } - (result, tx_count) + (replay_result, tx_count) } fn mark_dead_slot(slot: u64, blocktree: &Blocktree, progress: &mut HashMap) { @@ -542,6 +612,11 @@ impl ReplayStage { } assert_eq!(*bank_slot, bank.slot()); if bank.tick_height() == bank.max_tick_height() { + if let Some(bank_progress) = &mut progress.get(&bank.slot()) { + bank_progress + .stats + .report_stats(bank_progress.num_entries, bank_progress.num_shreds); + } did_complete_bank = true; Self::process_completed_bank(my_pubkey, bank, slot_full_senders); } else { @@ -665,31 +740,26 @@ impl ReplayStage { fn load_blocktree_entries( bank: &Bank, blocktree: &Blocktree, - progress: &mut HashMap, - ) -> Result<(Vec, usize)> { + bank_progress: &mut ForkProgress, + ) -> Result<(Vec, usize, u64, u64)> { let bank_slot = bank.slot(); - let bank_progress = &mut progress - .entry(bank_slot) - .or_insert_with(|| ForkProgress::new(bank.last_blockhash())); - blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_blobs as u64) + blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_shreds as u64) } fn replay_entries_into_bank( bank: &Arc, entries: Vec, - progress: &mut HashMap, + bank_progress: &mut ForkProgress, num: usize, ) -> Result<()> { - let bank_progress = &mut progress - .entry(bank.slot()) - .or_insert_with(|| ForkProgress::new(bank.last_blockhash())); let result = Self::verify_and_process_entries( &bank, &entries, - &bank_progress.last_entry, - bank_progress.num_blobs, + bank_progress.num_shreds, + bank_progress, ); - bank_progress.num_blobs += num; + bank_progress.num_shreds += num; + bank_progress.num_entries += entries.len(); if let Some(last_entry) = entries.last() { bank_progress.last_entry = last_entry.hash; } @@ -697,15 +767,21 @@ impl ReplayStage { result } - pub fn verify_and_process_entries( + fn verify_and_process_entries( bank: &Arc, entries: &[Entry], - last_entry: &Hash, shred_index: usize, + bank_progress: &mut ForkProgress, ) -> Result<()> { - if !entries.verify(last_entry) { - warn!( - "entry verification failed {} {} {} {} {}", + let now = Instant::now(); + let last_entry = &bank_progress.last_entry; + let verify_result = entries.verify(last_entry); + let verify_entries_elapsed = now.elapsed().as_micros(); + bank_progress.stats.entry_verification_elapsed += verify_entries_elapsed as u64; + if !verify_result { + info!( + "entry verification failed, slot: {}, entry len: {}, tick_height: {}, last entry: {}, last_blockhash: {}, shred_index: {}", + bank.slot(), entries.len(), bank.tick_height(), last_entry, @@ -720,8 +796,13 @@ impl ReplayStage { ); return Err(Error::BlobError(BlobError::VerificationFailed)); } - blocktree_processor::process_entries(bank, entries, true)?; + let now = Instant::now(); + let res = blocktree_processor::process_entries(bank, entries, true); + let replay_elapsed = now.elapsed().as_micros(); + bank_progress.stats.replay_elapsed += replay_elapsed as u64; + + res?; Ok(()) } @@ -859,7 +940,7 @@ mod test { let bank0 = Bank::new(&genesis_block); let bank_forks = Arc::new(RwLock::new(BankForks::new(0, bank0))); let mut progress = HashMap::new(); - progress.insert(5, ForkProgress::new(Hash::default())); + progress.insert(5, ForkProgress::new(0, Hash::default())); ReplayStage::handle_new_root(&bank_forks, &mut progress); assert!(progress.is_empty()); } @@ -963,7 +1044,7 @@ mod test { let bank0 = Arc::new(Bank::new(&genesis_block)); let mut progress = HashMap::new(); let last_blockhash = bank0.last_blockhash(); - progress.insert(bank0.slot(), ForkProgress::new(last_blockhash)); + progress.insert(bank0.slot(), ForkProgress::new(0, last_blockhash)); let shreds = shred_to_insert(&last_blockhash, bank0.slot()); blocktree.insert_shreds(shreds, None).unwrap(); let (res, _tx_count) = diff --git a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json index b66301b52c..ce086359c7 100644 --- a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json +++ b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json @@ -15,8 +15,8 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 948, - "iteration": 1568522114560, + "id": 1033, + "iteration": 1571184520228, "links": [ { "asDropdown": true, @@ -6412,7 +6412,14 @@ } }, { - "aliasColors": {}, + "aliasColors": { + "cluster-info.repair": "#ba43a9", + "replay_stage-new_leader.last": "#00ffbb", + "tower-observed.squash_account": "#0a437c", + "tower-observed.squash_cache": "#ea6460", + "window-service.receive": "#b7dbab", + "window-stage.consumed": "#5195ce" + }, "bars": false, "dashLength": 10, "dashes": false, @@ -6424,7 +6431,7 @@ "x": 16, "y": 63 }, - "id": 47, + "id": 68, "legend": { "alignAsTable": false, "avg": false, @@ -6438,9 +6445,9 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "nullPointMode": "connected", "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -6463,10 +6470,11 @@ "type": "fill" } ], + "hide": false, "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"streamer-recv_window-retransmit\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", + "query": "SELECT mean(\"fetch_entries_time\") AS \"fetch_entries_micros\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -6501,9 +6509,11 @@ "type": "fill" } ], + "hide": false, + "measurement": "cluster_info-vote-count", "orderByTime": "ASC", - "policy": "default", - "query": "SELECT sum(\"count\") AS \"window receive\" FROM \"$testnet\".\"autogen\".\"streamer-recv_window-recv\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", + "policy": "autogen", + "query": "SELECT mean(\"fetch_entries_fail_time\") AS \"fetch_entries_fail_micros\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -6511,13 +6521,13 @@ [ { "params": [ - "value" + "count" ], "type": "field" }, { "params": [], - "type": "mean" + "type": "sum" } ] ], @@ -6538,9 +6548,11 @@ "type": "fill" } ], + "hide": false, + "measurement": "cluster_info-vote-count", "orderByTime": "ASC", - "policy": "default", - "query": "SELECT sum(\"count\") AS \"broadcast sent\" FROM \"$testnet\".\"autogen\".\"streamer-broadcast-sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", + "policy": "autogen", + "query": "SELECT mean(\"entry_verification_time\") AS \"entry_verification_time_micros\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "C", "resultFormat": "time_series", @@ -6548,13 +6560,169 @@ [ { "params": [ - "value" + "count" ], "type": "field" }, { "params": [], - "type": "mean" + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"replay_time\") AS \"replay_time_micros\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"replay_total_elapsed\") AS \"replay_total_elapsed_micros\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"total_entries\") AS \"total_entries\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"total_shreds\") AS \"total_shreds\" FROM \"$testnet\".\"autogen\".\"replay-slot-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" } ] ], @@ -6564,7 +6732,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Send/Receive/Retransmit", + "title": "Time spent in ReplayStage ($hostid)", "tooltip": { "shared": true, "sort": 0, @@ -6580,7 +6748,7 @@ }, "yaxes": [ { - "format": "short", + "format": "µs", "label": null, "logBase": 1, "max": null, @@ -6879,6 +7047,9 @@ { "aliasColors": { "cluster-info.repair": "#ba43a9", + "replay_stage-new_leader.last": "#00ffbb", + "tower-observed.squash_account": "#0a437c", + "tower-observed.squash_cache": "#ea6460", "window-service.receive": "#b7dbab", "window-stage.consumed": "#5195ce" }, @@ -6888,12 +7059,12 @@ "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 16, "y": 69 }, - "id": 50, + "id": 69, "legend": { "alignAsTable": false, "avg": false, @@ -6904,13 +7075,13 @@ "total": false, "values": false }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, - "pointradius": 1, - "points": true, + "pointradius": 2, + "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, @@ -6932,23 +7103,25 @@ "type": "fill" } ], + "hide": false, + "measurement": "cluster_info-vote-count", "orderByTime": "ASC", - "policy": "default", - "query": "SELECT last(\"repair-highest-slot\") AS \"slot\" FROM \"$testnet\".\"autogen\".\"cluster_info-repair_highest\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "policy": "autogen", + "query": "SELECT mean(\"receive_time\") AS \"receive_time\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, - "refId": "C", + "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ - "value" + "count" ], "type": "field" }, { "params": [], - "type": "mean" + "type": "sum" } ] ], @@ -6969,23 +7142,181 @@ "type": "fill" } ], + "hide": false, + "measurement": "cluster_info-vote-count", "orderByTime": "ASC", - "policy": "default", - "query": "SELECT last(\"repair-highest-ix\") AS \"ix\" FROM \"$testnet\".\"autogen\".\"cluster_info-repair_highest\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "policy": "autogen", + "query": "SELECT mean(\"shredding_time\") AS \"shredding_time\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", "rawQuery": true, - "refId": "A", + "refId": "B", "resultFormat": "time_series", "select": [ [ { "params": [ - "value" + "count" ], "type": "field" }, { "params": [], - "type": "mean" + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"clone_and_seed\") AS \"clone_and_seed\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"broadcast_time\") AS \"broadcast_time\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"num_shreds\") AS \"num_shreds\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT mean(\"slot_broadcast_time\") AS \"slot_broadcast_time\" FROM \"$testnet\".\"autogen\".\"broadcast-bank-stats\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" } ] ], @@ -6995,7 +7326,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Repair highest index in slot ($hostid)", + "title": "Time spent in Broadcast ($hostid)", "tooltip": { "shared": true, "sort": 0, @@ -7011,7 +7342,7 @@ }, "yaxes": [ { - "format": "short", + "format": "µs", "label": null, "logBase": 1, "max": null, @@ -7348,19 +7679,18 @@ "datasource": "$datasource", "fill": 1, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 16, - "y": 74 + "y": 75 }, - "id": 67, + "id": 47, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, "total": false, "values": false @@ -7396,142 +7726,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT sum(\"recovered\") AS \"recovered\" FROM \"$testnet\".\"autogen\".\"blocktree-erasure\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "count" - ], - "type": "field" - }, - { - "params": [], - "type": "sum" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Erasure Recovery ($hostid)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 79 - }, - "id": 53, - "panels": [], - "title": "Tower Consensus", - "type": "row" - }, - { - "aliasColors": { - "cluster-info.repair": "#ba43a9", - "replay_stage-new_leader.last": "#00ffbb", - "window-service.receive": "#b7dbab", - "window-stage.consumed": "#5195ce" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - "h": 5, - "w": 8, - "x": 0, - "y": 80 - }, - "id": 54, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 2, - "points": true, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "hide": false, - "measurement": "cluster_info-vote-count", - "orderByTime": "ASC", - "policy": "autogen", - "query": "SELECT last(\"latest\") - last(\"root\") FROM \"$testnet\".\"autogen\".\"tower-vote\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"streamer-recv_window-retransmit\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -7568,7 +7763,7 @@ ], "orderByTime": "ASC", "policy": "default", - "query": "SELECT last(\"slot\") - last(\"root\") FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT sum(\"count\") AS \"window receive\" FROM \"$testnet\".\"autogen\".\"streamer-recv_window-recv\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -7587,12 +7782,49 @@ ] ], "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT sum(\"count\") AS \"broadcast sent\" FROM \"$testnet\".\"autogen\".\"streamer-broadcast-sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] } ], "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Tower Distance in Latest and Root Slot ($hostid)", + "title": "Send/Receive/Retransmit", "tooltip": { "shared": true, "sort": 0, @@ -7646,9 +7878,9 @@ "h": 5, "w": 8, "x": 8, - "y": 80 + "y": 79 }, - "id": 55, + "id": 56, "legend": { "alignAsTable": false, "avg": false, @@ -7789,6 +8021,451 @@ "alignLevel": null } }, + { + "aliasColors": { + "cluster-info.repair": "#ba43a9", + "window-service.receive": "#b7dbab", + "window-stage.consumed": "#5195ce" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 81 + }, + "id": 50, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"repair-highest-slot\") AS \"slot\" FROM \"$testnet\".\"autogen\".\"cluster_info-repair_highest\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"repair-highest-ix\") AS \"ix\" FROM \"$testnet\".\"autogen\".\"cluster_info-repair_highest\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Repair highest index in slot ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 86 + }, + "id": 53, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT sum(\"recovered\") AS \"recovered\" FROM \"$testnet\".\"autogen\".\"blocktree-erasure\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Erasure Recovery ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 91 + }, + "id": 54, + "panels": [], + "title": "Tower Consensus", + "type": "row" + }, + { + "aliasColors": { + "cluster-info.repair": "#ba43a9", + "replay_stage-new_leader.last": "#00ffbb", + "window-service.receive": "#b7dbab", + "window-stage.consumed": "#5195ce" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 92 + }, + "id": 55, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "cluster_info-vote-count", + "orderByTime": "ASC", + "policy": "autogen", + "query": "SELECT last(\"latest\") - last(\"root\") FROM \"$testnet\".\"autogen\".\"tower-vote\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"slot\") - last(\"root\") FROM \"$testnet\".\"autogen\".\"tower-observed\" WHERE host_id::tag =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Tower Distance in Latest and Root Slot ($hostid)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": { "bank-process_transactions-txs.transactions": "#e5a8e2", @@ -7806,9 +8483,9 @@ "h": 5, "w": 8, "x": 16, - "y": 80 + "y": 92 }, - "id": 56, + "id": 57, "legend": { "alignAsTable": false, "avg": false, @@ -7991,9 +8668,9 @@ "h": 1, "w": 24, "x": 0, - "y": 85 + "y": 97 }, - "id": 57, + "id": 58, "panels": [], "repeat": null, "title": "IP Network", @@ -8010,9 +8687,9 @@ "h": 5, "w": 12, "x": 0, - "y": 86 + "y": 98 }, - "id": 58, + "id": 59, "legend": { "alignAsTable": false, "avg": false, @@ -8243,9 +8920,9 @@ "h": 5, "w": 12, "x": 12, - "y": 86 + "y": 98 }, - "id": 59, + "id": 60, "legend": { "alignAsTable": false, "avg": false, @@ -8396,9 +9073,9 @@ "h": 1, "w": 24, "x": 0, - "y": 91 + "y": 103 }, - "id": 60, + "id": 61, "panels": [], "title": "Signature Verification", "type": "row" @@ -8414,9 +9091,9 @@ "h": 5, "w": 12, "x": 0, - "y": 92 + "y": 104 }, - "id": 61, + "id": 62, "legend": { "avg": false, "current": false, @@ -8535,9 +9212,9 @@ "h": 5, "w": 12, "x": 12, - "y": 92 + "y": 104 }, - "id": 62, + "id": 63, "legend": { "alignAsTable": false, "avg": false, @@ -8684,9 +9361,9 @@ "h": 1, "w": 24, "x": 0, - "y": 97 + "y": 109 }, - "id": 63, + "id": 64, "panels": [], "title": "Snapshots", "type": "row" @@ -8702,9 +9379,9 @@ "h": 6, "w": 8, "x": 0, - "y": 98 + "y": 110 }, - "id": 64, + "id": 65, "legend": { "avg": false, "current": false, @@ -8894,9 +9571,9 @@ "h": 6, "w": 8, "x": 8, - "y": 98 + "y": 110 }, - "id": 65, + "id": 66, "legend": { "avg": false, "current": false, @@ -9086,9 +9763,9 @@ "h": 6, "w": 8, "x": 16, - "y": 98 + "y": 110 }, - "id": 66, + "id": 67, "legend": { "avg": false, "current": false, @@ -9278,6 +9955,7 @@ "list": [ { "current": { + "selected": true, "text": "$datasource", "value": "$datasource" }, @@ -9366,5 +10044,5 @@ "timezone": "", "title": "Testnet Monitor (edge)", "uid": "testnet-edge", - "version": 1 -} + "version": 6 +} \ No newline at end of file