ReplayStage metrics (#6358)
* ReplayStage metrics * Add more metrics * Refactor get_slot_entries_with_shred_count() to detect wasted work * Update dashboard * Update broadcast slots to micros * Add broadcast dashboard
This commit is contained in:
@ -25,6 +25,7 @@ use std::path::{Path, PathBuf};
|
|||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TrySendError};
|
use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TrySendError};
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
pub use self::meta::*;
|
pub use self::meta::*;
|
||||||
pub use self::rooted_slot_iterator::*;
|
pub use self::rooted_slot_iterator::*;
|
||||||
@ -981,65 +982,73 @@ impl Blocktree {
|
|||||||
&self,
|
&self,
|
||||||
slot: u64,
|
slot: u64,
|
||||||
mut start_index: u64,
|
mut start_index: u64,
|
||||||
) -> Result<(Vec<Entry>, usize)> {
|
) -> Result<(Vec<Entry>, usize, u64, u64)> {
|
||||||
// Find the next consecutive block of shreds.
|
let mut useful_time = 0;
|
||||||
let mut serialized_shreds: Vec<Vec<u8>> = vec![];
|
let mut wasted_time = 0;
|
||||||
let data_shred_cf = self.db.column::<cf::ShredData>();
|
|
||||||
|
|
||||||
while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, start_index))? {
|
|
||||||
serialized_shreds.push(serialized_shred);
|
|
||||||
start_index += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
trace!(
|
|
||||||
"Found {:?} shreds for slot {:?}",
|
|
||||||
serialized_shreds.len(),
|
|
||||||
slot
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut shreds: Vec<Shred> = serialized_shreds
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|serialized_shred| Shred::new_from_serialized_shred(serialized_shred).ok())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut all_entries = vec![];
|
let mut all_entries = vec![];
|
||||||
let mut num = 0;
|
let mut num_shreds = 0;
|
||||||
loop {
|
loop {
|
||||||
let mut look_for_last_shred = true;
|
let now = Instant::now();
|
||||||
|
let mut res = self.get_entries_in_data_block(slot, &mut start_index);
|
||||||
|
let elapsed = now.elapsed().as_micros();
|
||||||
|
|
||||||
let mut shred_chunk = vec![];
|
if let Ok((ref mut entries, new_num_shreds)) = res {
|
||||||
while look_for_last_shred && !shreds.is_empty() {
|
if !entries.is_empty() {
|
||||||
let shred = shreds.remove(0);
|
all_entries.append(entries);
|
||||||
if shred.data_complete() || shred.last_in_slot() {
|
num_shreds += new_num_shreds;
|
||||||
look_for_last_shred = false;
|
useful_time += elapsed;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
shred_chunk.push(shred);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!(
|
// All unsuccessful cases (errors, incomplete data blocks) will count as wasted work
|
||||||
"{:?} shreds in last FEC set. Looking for last shred {:?}",
|
wasted_time += elapsed;
|
||||||
shred_chunk.len(),
|
res?;
|
||||||
look_for_last_shred
|
|
||||||
);
|
|
||||||
|
|
||||||
// Break if we didn't find the last shred (as more data is required)
|
|
||||||
if look_for_last_shred {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
trace!("Found {:?} entries", all_entries.len());
|
||||||
|
Ok((
|
||||||
|
all_entries,
|
||||||
|
num_shreds,
|
||||||
|
useful_time as u64,
|
||||||
|
wasted_time as u64,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_entries_in_data_block(
|
||||||
|
&self,
|
||||||
|
slot: u64,
|
||||||
|
start_index: &mut u64,
|
||||||
|
) -> Result<(Vec<Entry>, usize)> {
|
||||||
|
let mut shred_chunk: Vec<Shred> = vec![];
|
||||||
|
let data_shred_cf = self.db.column::<cf::ShredData>();
|
||||||
|
while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, *start_index))? {
|
||||||
|
*start_index += 1;
|
||||||
|
let new_shred = Shred::new_from_serialized_shred(serialized_shred).ok();
|
||||||
|
if let Some(shred) = new_shred {
|
||||||
|
let is_complete = shred.data_complete() || shred.last_in_slot();
|
||||||
|
shred_chunk.push(shred);
|
||||||
|
if is_complete {
|
||||||
if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) {
|
if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) {
|
||||||
|
debug!("{:?} shreds in last FEC set", shred_chunk.len(),);
|
||||||
let entries: Vec<Entry> = bincode::deserialize(&deshred_payload)?;
|
let entries: Vec<Entry> = bincode::deserialize(&deshred_payload)?;
|
||||||
trace!("Found entries: {:#?}", entries);
|
return Ok((entries, shred_chunk.len()));
|
||||||
all_entries.extend(entries);
|
|
||||||
num += shred_chunk.len();
|
|
||||||
} else {
|
} else {
|
||||||
debug!("Failed in deshredding shred payloads");
|
debug!("Failed in deshredding shred payloads");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Didn't find a valid shred, this slot is dead.
|
||||||
|
// TODO: Mark as dead, but have to carefully handle last shred of interrupted
|
||||||
|
// slots.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
trace!("Found {:?} entries", all_entries.len());
|
Ok((vec![], 0))
|
||||||
Ok((all_entries, num))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns slots connecting to any element of the list `slots`.
|
// Returns slots connecting to any element of the list `slots`.
|
||||||
|
@ -298,7 +298,7 @@ mod test {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let blocktree = broadcast_service.blocktree;
|
let blocktree = broadcast_service.blocktree;
|
||||||
let (entries, _) = blocktree
|
let (entries, _, _, _) = blocktree
|
||||||
.get_slot_entries_with_shred_count(slot, 0)
|
.get_slot_entries_with_shred_count(slot, 0)
|
||||||
.expect("Expect entries to be present");
|
.expect("Expect entries to be present");
|
||||||
assert_eq!(entries.len(), max_tick_height as usize);
|
assert_eq!(entries.len(), max_tick_height as usize);
|
||||||
|
@ -270,7 +270,7 @@ impl StandardBroadcastRun {
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
"slot_broadcast_time",
|
"slot_broadcast_time",
|
||||||
self.slot_broadcast_start.unwrap().elapsed().as_millis() as i64,
|
self.slot_broadcast_start.unwrap().elapsed().as_micros() as i64,
|
||||||
i64
|
i64
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
@ -59,20 +59,73 @@ pub struct ReplayStage {
|
|||||||
confidence_service: AggregateConfidenceService,
|
confidence_service: AggregateConfidenceService,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
struct ReplaySlotStats {
|
||||||
|
// Per-slot elapsed time
|
||||||
|
slot: u64,
|
||||||
|
fetch_entries_elapsed: u64,
|
||||||
|
fetch_entries_fail_elapsed: u64,
|
||||||
|
entry_verification_elapsed: u64,
|
||||||
|
replay_elapsed: u64,
|
||||||
|
replay_start: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ReplaySlotStats {
|
||||||
|
pub fn new(slot: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
slot,
|
||||||
|
fetch_entries_elapsed: 0,
|
||||||
|
fetch_entries_fail_elapsed: 0,
|
||||||
|
entry_verification_elapsed: 0,
|
||||||
|
replay_elapsed: 0,
|
||||||
|
replay_start: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn report_stats(&self, total_entries: usize, total_shreds: usize) {
|
||||||
|
datapoint_info!(
|
||||||
|
"replay-slot-stats",
|
||||||
|
("slot", self.slot as i64, i64),
|
||||||
|
("fetch_entries_time", self.fetch_entries_elapsed as i64, i64),
|
||||||
|
(
|
||||||
|
"fetch_entries_fail_time",
|
||||||
|
self.fetch_entries_fail_elapsed as i64,
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"entry_verification_time",
|
||||||
|
self.entry_verification_elapsed as i64,
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("replay_time", self.replay_elapsed as i64, i64),
|
||||||
|
(
|
||||||
|
"replay_total_elapsed",
|
||||||
|
self.replay_start.elapsed().as_micros() as i64,
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("total_entries", total_entries as i64, i64),
|
||||||
|
("total_shreds", total_shreds as i64, i64),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct ForkProgress {
|
struct ForkProgress {
|
||||||
last_entry: Hash,
|
last_entry: Hash,
|
||||||
num_blobs: usize,
|
num_shreds: usize,
|
||||||
|
num_entries: usize,
|
||||||
started_ms: u64,
|
started_ms: u64,
|
||||||
is_dead: bool,
|
is_dead: bool,
|
||||||
|
stats: ReplaySlotStats,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ForkProgress {
|
impl ForkProgress {
|
||||||
pub fn new(last_entry: Hash) -> Self {
|
pub fn new(slot: u64, last_entry: Hash) -> Self {
|
||||||
Self {
|
Self {
|
||||||
last_entry,
|
last_entry,
|
||||||
num_blobs: 0,
|
num_shreds: 0,
|
||||||
|
num_entries: 0,
|
||||||
started_ms: timing::timestamp(),
|
started_ms: timing::timestamp(),
|
||||||
is_dead: false,
|
is_dead: false,
|
||||||
|
stats: ReplaySlotStats::new(slot),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -369,24 +422,41 @@ impl ReplayStage {
|
|||||||
progress: &mut HashMap<u64, ForkProgress>,
|
progress: &mut HashMap<u64, ForkProgress>,
|
||||||
) -> (Result<()>, usize) {
|
) -> (Result<()>, usize) {
|
||||||
let mut tx_count = 0;
|
let mut tx_count = 0;
|
||||||
let result =
|
let bank_progress = &mut progress
|
||||||
Self::load_blocktree_entries(bank, blocktree, progress).and_then(|(entries, num)| {
|
.entry(bank.slot())
|
||||||
debug!("Replaying {:?} entries, num {:?}", entries.len(), num);
|
.or_insert_with(|| ForkProgress::new(bank.slot(), bank.last_blockhash()));
|
||||||
|
let now = Instant::now();
|
||||||
|
let load_result = Self::load_blocktree_entries(bank, blocktree, bank_progress);
|
||||||
|
let fetch_entries_elapsed = now.elapsed().as_micros();
|
||||||
|
|
||||||
|
if load_result.is_err() {
|
||||||
|
bank_progress.stats.fetch_entries_fail_elapsed += fetch_entries_elapsed as u64;
|
||||||
|
}
|
||||||
|
let replay_result =
|
||||||
|
load_result.and_then(|(entries, num_shreds, useful_time, wasted_time)| {
|
||||||
|
trace!(
|
||||||
|
"Fetch entries for slot {}, {:?} entries, num shreds {:?}",
|
||||||
|
bank.slot(),
|
||||||
|
entries.len(),
|
||||||
|
num_shreds
|
||||||
|
);
|
||||||
tx_count += entries.iter().map(|e| e.transactions.len()).sum::<usize>();
|
tx_count += entries.iter().map(|e| e.transactions.len()).sum::<usize>();
|
||||||
Self::replay_entries_into_bank(bank, entries, progress, num)
|
bank_progress.stats.fetch_entries_elapsed += useful_time as u64;
|
||||||
|
bank_progress.stats.fetch_entries_fail_elapsed += wasted_time as u64;
|
||||||
|
Self::replay_entries_into_bank(bank, entries, bank_progress, num_shreds)
|
||||||
});
|
});
|
||||||
|
|
||||||
if Self::is_replay_result_fatal(&result) {
|
if Self::is_replay_result_fatal(&replay_result) {
|
||||||
warn!(
|
warn!(
|
||||||
"Fatal replay result in slot: {}, result: {:?}",
|
"Fatal replay result in slot: {}, result: {:?}",
|
||||||
bank.slot(),
|
bank.slot(),
|
||||||
result
|
replay_result
|
||||||
);
|
);
|
||||||
datapoint_warn!("replay-stage-mark_dead_slot", ("slot", bank.slot(), i64),);
|
datapoint_warn!("replay-stage-mark_dead_slot", ("slot", bank.slot(), i64),);
|
||||||
Self::mark_dead_slot(bank.slot(), blocktree, progress);
|
Self::mark_dead_slot(bank.slot(), blocktree, progress);
|
||||||
}
|
}
|
||||||
|
|
||||||
(result, tx_count)
|
(replay_result, tx_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn mark_dead_slot(slot: u64, blocktree: &Blocktree, progress: &mut HashMap<u64, ForkProgress>) {
|
fn mark_dead_slot(slot: u64, blocktree: &Blocktree, progress: &mut HashMap<u64, ForkProgress>) {
|
||||||
@ -542,6 +612,11 @@ impl ReplayStage {
|
|||||||
}
|
}
|
||||||
assert_eq!(*bank_slot, bank.slot());
|
assert_eq!(*bank_slot, bank.slot());
|
||||||
if bank.tick_height() == bank.max_tick_height() {
|
if bank.tick_height() == bank.max_tick_height() {
|
||||||
|
if let Some(bank_progress) = &mut progress.get(&bank.slot()) {
|
||||||
|
bank_progress
|
||||||
|
.stats
|
||||||
|
.report_stats(bank_progress.num_entries, bank_progress.num_shreds);
|
||||||
|
}
|
||||||
did_complete_bank = true;
|
did_complete_bank = true;
|
||||||
Self::process_completed_bank(my_pubkey, bank, slot_full_senders);
|
Self::process_completed_bank(my_pubkey, bank, slot_full_senders);
|
||||||
} else {
|
} else {
|
||||||
@ -665,31 +740,26 @@ impl ReplayStage {
|
|||||||
fn load_blocktree_entries(
|
fn load_blocktree_entries(
|
||||||
bank: &Bank,
|
bank: &Bank,
|
||||||
blocktree: &Blocktree,
|
blocktree: &Blocktree,
|
||||||
progress: &mut HashMap<u64, ForkProgress>,
|
bank_progress: &mut ForkProgress,
|
||||||
) -> Result<(Vec<Entry>, usize)> {
|
) -> Result<(Vec<Entry>, usize, u64, u64)> {
|
||||||
let bank_slot = bank.slot();
|
let bank_slot = bank.slot();
|
||||||
let bank_progress = &mut progress
|
blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_shreds as u64)
|
||||||
.entry(bank_slot)
|
|
||||||
.or_insert_with(|| ForkProgress::new(bank.last_blockhash()));
|
|
||||||
blocktree.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_blobs as u64)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn replay_entries_into_bank(
|
fn replay_entries_into_bank(
|
||||||
bank: &Arc<Bank>,
|
bank: &Arc<Bank>,
|
||||||
entries: Vec<Entry>,
|
entries: Vec<Entry>,
|
||||||
progress: &mut HashMap<u64, ForkProgress>,
|
bank_progress: &mut ForkProgress,
|
||||||
num: usize,
|
num: usize,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let bank_progress = &mut progress
|
|
||||||
.entry(bank.slot())
|
|
||||||
.or_insert_with(|| ForkProgress::new(bank.last_blockhash()));
|
|
||||||
let result = Self::verify_and_process_entries(
|
let result = Self::verify_and_process_entries(
|
||||||
&bank,
|
&bank,
|
||||||
&entries,
|
&entries,
|
||||||
&bank_progress.last_entry,
|
bank_progress.num_shreds,
|
||||||
bank_progress.num_blobs,
|
bank_progress,
|
||||||
);
|
);
|
||||||
bank_progress.num_blobs += num;
|
bank_progress.num_shreds += num;
|
||||||
|
bank_progress.num_entries += entries.len();
|
||||||
if let Some(last_entry) = entries.last() {
|
if let Some(last_entry) = entries.last() {
|
||||||
bank_progress.last_entry = last_entry.hash;
|
bank_progress.last_entry = last_entry.hash;
|
||||||
}
|
}
|
||||||
@ -697,15 +767,21 @@ impl ReplayStage {
|
|||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn verify_and_process_entries(
|
fn verify_and_process_entries(
|
||||||
bank: &Arc<Bank>,
|
bank: &Arc<Bank>,
|
||||||
entries: &[Entry],
|
entries: &[Entry],
|
||||||
last_entry: &Hash,
|
|
||||||
shred_index: usize,
|
shred_index: usize,
|
||||||
|
bank_progress: &mut ForkProgress,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if !entries.verify(last_entry) {
|
let now = Instant::now();
|
||||||
warn!(
|
let last_entry = &bank_progress.last_entry;
|
||||||
"entry verification failed {} {} {} {} {}",
|
let verify_result = entries.verify(last_entry);
|
||||||
|
let verify_entries_elapsed = now.elapsed().as_micros();
|
||||||
|
bank_progress.stats.entry_verification_elapsed += verify_entries_elapsed as u64;
|
||||||
|
if !verify_result {
|
||||||
|
info!(
|
||||||
|
"entry verification failed, slot: {}, entry len: {}, tick_height: {}, last entry: {}, last_blockhash: {}, shred_index: {}",
|
||||||
|
bank.slot(),
|
||||||
entries.len(),
|
entries.len(),
|
||||||
bank.tick_height(),
|
bank.tick_height(),
|
||||||
last_entry,
|
last_entry,
|
||||||
@ -720,8 +796,13 @@ impl ReplayStage {
|
|||||||
);
|
);
|
||||||
return Err(Error::BlobError(BlobError::VerificationFailed));
|
return Err(Error::BlobError(BlobError::VerificationFailed));
|
||||||
}
|
}
|
||||||
blocktree_processor::process_entries(bank, entries, true)?;
|
|
||||||
|
|
||||||
|
let now = Instant::now();
|
||||||
|
let res = blocktree_processor::process_entries(bank, entries, true);
|
||||||
|
let replay_elapsed = now.elapsed().as_micros();
|
||||||
|
bank_progress.stats.replay_elapsed += replay_elapsed as u64;
|
||||||
|
|
||||||
|
res?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -859,7 +940,7 @@ mod test {
|
|||||||
let bank0 = Bank::new(&genesis_block);
|
let bank0 = Bank::new(&genesis_block);
|
||||||
let bank_forks = Arc::new(RwLock::new(BankForks::new(0, bank0)));
|
let bank_forks = Arc::new(RwLock::new(BankForks::new(0, bank0)));
|
||||||
let mut progress = HashMap::new();
|
let mut progress = HashMap::new();
|
||||||
progress.insert(5, ForkProgress::new(Hash::default()));
|
progress.insert(5, ForkProgress::new(0, Hash::default()));
|
||||||
ReplayStage::handle_new_root(&bank_forks, &mut progress);
|
ReplayStage::handle_new_root(&bank_forks, &mut progress);
|
||||||
assert!(progress.is_empty());
|
assert!(progress.is_empty());
|
||||||
}
|
}
|
||||||
@ -963,7 +1044,7 @@ mod test {
|
|||||||
let bank0 = Arc::new(Bank::new(&genesis_block));
|
let bank0 = Arc::new(Bank::new(&genesis_block));
|
||||||
let mut progress = HashMap::new();
|
let mut progress = HashMap::new();
|
||||||
let last_blockhash = bank0.last_blockhash();
|
let last_blockhash = bank0.last_blockhash();
|
||||||
progress.insert(bank0.slot(), ForkProgress::new(last_blockhash));
|
progress.insert(bank0.slot(), ForkProgress::new(0, last_blockhash));
|
||||||
let shreds = shred_to_insert(&last_blockhash, bank0.slot());
|
let shreds = shred_to_insert(&last_blockhash, bank0.slot());
|
||||||
blocktree.insert_shreds(shreds, None).unwrap();
|
blocktree.insert_shreds(shreds, None).unwrap();
|
||||||
let (res, _tx_count) =
|
let (res, _tx_count) =
|
||||||
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user