ReplayStage metrics (#6358)

* ReplayStage metrics

* Add more metrics

* Refactor get_slot_entries_with_shred_count() to detect wasted work

* Update dashboard

* Update broadcast slots to micros

* Add broadcast dashboard
Author: carllin
Date: 2019-10-16 14:32:18 -07:00
Committed by: GitHub
Parent: 2d351d3952
Commit: ccb4e32ee0
5 changed files with 1053 additions and 285 deletions


@@ -25,6 +25,7 @@ use std::path::{Path, PathBuf};
 use std::rc::Rc;
 use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TrySendError};
 use std::sync::{Arc, RwLock};
+use std::time::Instant;
 
 pub use self::meta::*;
 pub use self::rooted_slot_iterator::*;
@@ -981,65 +982,73 @@ impl Blocktree {
         &self,
         slot: u64,
         mut start_index: u64,
-    ) -> Result<(Vec<Entry>, usize)> {
-        // Find the next consecutive block of shreds.
-        let mut serialized_shreds: Vec<Vec<u8>> = vec![];
-        let data_shred_cf = self.db.column::<cf::ShredData>();
-        while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, start_index))? {
-            serialized_shreds.push(serialized_shred);
-            start_index += 1;
-        }
-        trace!(
-            "Found {:?} shreds for slot {:?}",
-            serialized_shreds.len(),
-            slot
-        );
-        let mut shreds: Vec<Shred> = serialized_shreds
-            .into_iter()
-            .filter_map(|serialized_shred| Shred::new_from_serialized_shred(serialized_shred).ok())
-            .collect();
+    ) -> Result<(Vec<Entry>, usize, u64, u64)> {
+        let mut useful_time = 0;
+        let mut wasted_time = 0;
         let mut all_entries = vec![];
-        let mut num = 0;
+        let mut num_shreds = 0;
         loop {
-            let mut look_for_last_shred = true;
-            let mut shred_chunk = vec![];
-            while look_for_last_shred && !shreds.is_empty() {
-                let shred = shreds.remove(0);
-                if shred.data_complete() || shred.last_in_slot() {
-                    look_for_last_shred = false;
-                }
-                shred_chunk.push(shred);
-            }
-            debug!(
-                "{:?} shreds in last FEC set. Looking for last shred {:?}",
-                shred_chunk.len(),
-                look_for_last_shred
-            );
-            // Break if we didn't find the last shred (as more data is required)
-            if look_for_last_shred {
-                break;
-            }
-            if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) {
-                let entries: Vec<Entry> = bincode::deserialize(&deshred_payload)?;
-                trace!("Found entries: {:#?}", entries);
-                all_entries.extend(entries);
-                num += shred_chunk.len();
-            } else {
-                debug!("Failed in deshredding shred payloads");
-                break;
-            }
-        }
-        trace!("Found {:?} entries", all_entries.len());
-        Ok((all_entries, num))
+            let now = Instant::now();
+            let mut res = self.get_entries_in_data_block(slot, &mut start_index);
+            let elapsed = now.elapsed().as_micros();
+            if let Ok((ref mut entries, new_num_shreds)) = res {
+                if !entries.is_empty() {
+                    all_entries.append(entries);
+                    num_shreds += new_num_shreds;
+                    useful_time += elapsed;
+                    continue;
+                }
+            }
+            // All unsuccessful cases (errors, incomplete data blocks) will count as wasted work
+            wasted_time += elapsed;
+            res?;
+            break;
+        }
+        trace!("Found {:?} entries", all_entries.len());
+        Ok((
+            all_entries,
+            num_shreds,
+            useful_time as u64,
+            wasted_time as u64,
+        ))
+    }
+
+    pub fn get_entries_in_data_block(
+        &self,
+        slot: u64,
+        start_index: &mut u64,
+    ) -> Result<(Vec<Entry>, usize)> {
+        let mut shred_chunk: Vec<Shred> = vec![];
+        let data_shred_cf = self.db.column::<cf::ShredData>();
+        while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, *start_index))? {
+            *start_index += 1;
+            let new_shred = Shred::new_from_serialized_shred(serialized_shred).ok();
+            if let Some(shred) = new_shred {
+                let is_complete = shred.data_complete() || shred.last_in_slot();
+                shred_chunk.push(shred);
+                if is_complete {
+                    if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) {
+                        debug!("{:?} shreds in last FEC set", shred_chunk.len(),);
+                        let entries: Vec<Entry> = bincode::deserialize(&deshred_payload)?;
+                        return Ok((entries, shred_chunk.len()));
+                    } else {
+                        debug!("Failed in deshredding shred payloads");
+                        break;
+                    }
+                }
+            } else {
+                // Didn't find a valid shred, this slot is dead.
+                // TODO: Mark as dead, but have to carefully handle last shred of interrupted
+                // slots.
+                break;
+            }
+        }
+        Ok((vec![], 0))
     }
 
     // Returns slots connecting to any element of the list `slots`.


@@ -298,7 +298,7 @@ mod test {
         );
         let blocktree = broadcast_service.blocktree;
-        let (entries, _) = blocktree
+        let (entries, _, _, _) = blocktree
             .get_slot_entries_with_shred_count(slot, 0)
             .expect("Expect entries to be present");
         assert_eq!(entries.len(), max_tick_height as usize);


@@ -270,7 +270,7 @@ impl StandardBroadcastRun {
             ),
             (
                 "slot_broadcast_time",
-                self.slot_broadcast_start.unwrap().elapsed().as_millis() as i64,
+                self.slot_broadcast_start.unwrap().elapsed().as_micros() as i64,
                 i64
             ),
         );


@@ -59,20 +59,73 @@ pub struct ReplayStage {
     confidence_service: AggregateConfidenceService,
 }
 
-#[derive(Default)]
+struct ReplaySlotStats {
+    // Per-slot elapsed time
+    slot: u64,
+    fetch_entries_elapsed: u64,
+    fetch_entries_fail_elapsed: u64,
+    entry_verification_elapsed: u64,
+    replay_elapsed: u64,
+    replay_start: Instant,
+}
+
+impl ReplaySlotStats {
+    pub fn new(slot: u64) -> Self {
+        Self {
+            slot,
+            fetch_entries_elapsed: 0,
+            fetch_entries_fail_elapsed: 0,
+            entry_verification_elapsed: 0,
+            replay_elapsed: 0,
+            replay_start: Instant::now(),
+        }
+    }
+
+    pub fn report_stats(&self, total_entries: usize, total_shreds: usize) {
+        datapoint_info!(
+            "replay-slot-stats",
+            ("slot", self.slot as i64, i64),
+            ("fetch_entries_time", self.fetch_entries_elapsed as i64, i64),
+            (
+                "fetch_entries_fail_time",
+                self.fetch_entries_fail_elapsed as i64,
+                i64
+            ),
+            (
+                "entry_verification_time",
+                self.entry_verification_elapsed as i64,
+                i64
+            ),
+            ("replay_time", self.replay_elapsed as i64, i64),
+            (
+                "replay_total_elapsed",
+                self.replay_start.elapsed().as_micros() as i64,
+                i64
+            ),
+            ("total_entries", total_entries as i64, i64),
+            ("total_shreds", total_shreds as i64, i64),
+        );
+    }
+}
+
 struct ForkProgress {
     last_entry: Hash,
-    num_blobs: usize,
+    num_shreds: usize,
+    num_entries: usize,
     started_ms: u64,
     is_dead: bool,
+    stats: ReplaySlotStats,
 }
 
 impl ForkProgress {
-    pub fn new(last_entry: Hash) -> Self {
+    pub fn new(slot: u64, last_entry: Hash) -> Self {
         Self {
             last_entry,
-            num_blobs: 0,
+            num_shreds: 0,
+            num_entries: 0,
             started_ms: timing::timestamp(),
             is_dead: false,
+            stats: ReplaySlotStats::new(slot),
         }
     }
 }
@@ -369,24 +422,41 @@ impl ReplayStage {
         progress: &mut HashMap<u64, ForkProgress>,
     ) -> (Result<()>, usize) {
         let mut tx_count = 0;
-        let result =
-            Self::load_blocktree_entries(bank, blocktree, progress).and_then(|(entries, num)| {
-                debug!("Replaying {:?} entries, num {:?}", entries.len(), num);
+        let bank_progress = &mut progress
+            .entry(bank.slot())
+            .or_insert_with(|| ForkProgress::new(bank.slot(), bank.last_blockhash()));
+        let now = Instant::now();
+        let load_result = Self::load_blocktree_entries(bank, blocktree, bank_progress);
+        let fetch_entries_elapsed = now.elapsed().as_micros();
+        if load_result.is_err() {
+            bank_progress.stats.fetch_entries_fail_elapsed += fetch_entries_elapsed as u64;
+        }
+        let replay_result =
+            load_result.and_then(|(entries, num_shreds, useful_time, wasted_time)| {
+                trace!(
+                    "Fetch entries for slot {}, {:?} entries, num shreds {:?}",
+                    bank.slot(),
+                    entries.len(),
+                    num_shreds
+                );
                 tx_count += entries.iter().map(|e| e.transactions.len()).sum::<usize>();
-                Self::replay_entries_into_bank(bank, entries, progress, num)
+                bank_progress.stats.fetch_entries_elapsed += useful_time as u64;
+                bank_progress.stats.fetch_entries_fail_elapsed += wasted_time as u64;
+                Self::replay_entries_into_bank(bank, entries, bank_progress, num_shreds)
             });
-        if Self::is_replay_result_fatal(&result) {
+        if Self::is_replay_result_fatal(&replay_result) {
             warn!(
                 "Fatal replay result in slot: {}, result: {:?}",
                 bank.slot(),
-                result
+                replay_result
             );
             datapoint_warn!("replay-stage-mark_dead_slot", ("slot", bank.slot(), i64),);
             Self::mark_dead_slot(bank.slot(), blocktree, progress);
         }
-        (result, tx_count)
+        (replay_result, tx_count)
     }
 
     fn mark_dead_slot(slot: u64, blocktree: &Blocktree, progress: &mut HashMap<u64, ForkProgress>) {
@@ -542,6 +612,11 @@ impl ReplayStage {
             }
             assert_eq!(*bank_slot, bank.slot());
             if bank.tick_height() == bank.max_tick_height() {
+                if let Some(bank_progress) = &mut progress.get(&bank.slot()) {
+                    bank_progress
+                        .stats
+                        .report_stats(bank_progress.num_entries, bank_progress.num_shreds);
+                }
                 did_complete_bank = true;
                 Self::process_completed_bank(my_pubkey, bank, slot_full_senders);
             } else {
@@ -665,31 +740,26 @@ impl ReplayStage {
     fn load_blocktree_entries(
         bank: &Bank,
         blocktree: &Blocktree,
-        progress: &mut HashMap<u64, ForkProgress>,
-    ) -> Result<(Vec<Entry>, usize)> {
+        bank_progress: &mut ForkProgress,
+    ) -> Result<(Vec<Entry>, usize, u64, u64)> {
         let bank_slot = bank.slot();
-        let bank_progress = &mut progress
-            .entry(bank_slot)
-            .or_insert_with(|| ForkProgress::new(bank.last_blockhash()));
-        blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_blobs as u64)
+        blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_shreds as u64)
     }
 
     fn replay_entries_into_bank(
         bank: &Arc<Bank>,
         entries: Vec<Entry>,
-        progress: &mut HashMap<u64, ForkProgress>,
+        bank_progress: &mut ForkProgress,
         num: usize,
     ) -> Result<()> {
-        let bank_progress = &mut progress
-            .entry(bank.slot())
-            .or_insert_with(|| ForkProgress::new(bank.last_blockhash()));
         let result = Self::verify_and_process_entries(
             &bank,
             &entries,
-            &bank_progress.last_entry,
-            bank_progress.num_blobs,
+            bank_progress.num_shreds,
+            bank_progress,
         );
-        bank_progress.num_blobs += num;
+        bank_progress.num_shreds += num;
+        bank_progress.num_entries += entries.len();
         if let Some(last_entry) = entries.last() {
             bank_progress.last_entry = last_entry.hash;
         }
@@ -697,15 +767,21 @@ impl ReplayStage {
         result
     }
 
-    pub fn verify_and_process_entries(
+    fn verify_and_process_entries(
         bank: &Arc<Bank>,
         entries: &[Entry],
-        last_entry: &Hash,
         shred_index: usize,
+        bank_progress: &mut ForkProgress,
     ) -> Result<()> {
-        if !entries.verify(last_entry) {
-            warn!(
-                "entry verification failed {} {} {} {} {}",
+        let now = Instant::now();
+        let last_entry = &bank_progress.last_entry;
+        let verify_result = entries.verify(last_entry);
+        let verify_entries_elapsed = now.elapsed().as_micros();
+        bank_progress.stats.entry_verification_elapsed += verify_entries_elapsed as u64;
+        if !verify_result {
+            info!(
+                "entry verification failed, slot: {}, entry len: {}, tick_height: {}, last entry: {}, last_blockhash: {}, shred_index: {}",
+                bank.slot(),
                 entries.len(),
                 bank.tick_height(),
                 last_entry,
@@ -720,8 +796,13 @@ impl ReplayStage {
             );
             return Err(Error::BlobError(BlobError::VerificationFailed));
         }
-        blocktree_processor::process_entries(bank, entries, true)?;
+
+        let now = Instant::now();
+        let res = blocktree_processor::process_entries(bank, entries, true);
+        let replay_elapsed = now.elapsed().as_micros();
+        bank_progress.stats.replay_elapsed += replay_elapsed as u64;
+        res?;
 
         Ok(())
     }
@@ -859,7 +940,7 @@ mod test {
         let bank0 = Bank::new(&genesis_block);
         let bank_forks = Arc::new(RwLock::new(BankForks::new(0, bank0)));
         let mut progress = HashMap::new();
-        progress.insert(5, ForkProgress::new(Hash::default()));
+        progress.insert(5, ForkProgress::new(0, Hash::default()));
         ReplayStage::handle_new_root(&bank_forks, &mut progress);
         assert!(progress.is_empty());
     }
@@ -963,7 +1044,7 @@ mod test {
         let bank0 = Arc::new(Bank::new(&genesis_block));
         let mut progress = HashMap::new();
         let last_blockhash = bank0.last_blockhash();
-        progress.insert(bank0.slot(), ForkProgress::new(last_blockhash));
+        progress.insert(bank0.slot(), ForkProgress::new(0, last_blockhash));
         let shreds = shred_to_insert(&last_blockhash, bank0.slot());
         blocktree.insert_shreds(shreds, None).unwrap();
         let (res, _tx_count) =