Separate out interrupted slots broadcast metrics (#20537)

This commit is contained in:
carllin
2021-10-09 01:46:06 -07:00
committed by GitHub
parent db9336c99e
commit 838ff3b871
3 changed files with 131 additions and 71 deletions

View File

@ -89,6 +89,7 @@ impl BroadcastRun for BroadcastFakeShredsRun {
slot, slot,
num_expected_batches: None, num_expected_batches: None,
slot_start_ts: Instant::now(), slot_start_ts: Instant::now(),
was_interrupted: false,
}; };
// 3) Start broadcast step // 3) Start broadcast step
//some indicates fake shreds //some indicates fake shreds

View File

@ -2,7 +2,7 @@ use super::*;
pub(crate) trait BroadcastStats { pub(crate) trait BroadcastStats {
fn update(&mut self, new_stats: &Self); fn update(&mut self, new_stats: &Self);
fn report_stats(&mut self, slot: Slot, slot_start: Instant); fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool);
} }
#[derive(Clone)] #[derive(Clone)]
@ -10,6 +10,7 @@ pub(crate) struct BroadcastShredBatchInfo {
pub(crate) slot: Slot, pub(crate) slot: Slot,
pub(crate) num_expected_batches: Option<usize>, pub(crate) num_expected_batches: Option<usize>,
pub(crate) slot_start_ts: Instant, pub(crate) slot_start_ts: Instant,
pub(crate) was_interrupted: bool,
} }
#[derive(Default, Clone)] #[derive(Default, Clone)]
@ -33,7 +34,20 @@ impl BroadcastStats for TransmitShredsStats {
self.total_packets += new_stats.total_packets; self.total_packets += new_stats.total_packets;
self.dropped_packets += new_stats.dropped_packets; self.dropped_packets += new_stats.dropped_packets;
} }
fn report_stats(&mut self, slot: Slot, slot_start: Instant) { fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool) {
if was_interrupted {
datapoint_info!(
"broadcast-transmit-shreds-interrupted-stats",
("slot", slot as i64, i64),
("transmit_elapsed", self.transmit_elapsed as i64, i64),
("send_mmsg_elapsed", self.send_mmsg_elapsed as i64, i64),
("get_peers_elapsed", self.get_peers_elapsed as i64, i64),
("num_shreds", self.num_shreds as i64, i64),
("shred_select", self.shred_select as i64, i64),
("total_packets", self.total_packets as i64, i64),
("dropped_packets", self.dropped_packets as i64, i64),
);
} else {
datapoint_info!( datapoint_info!(
"broadcast-transmit-shreds-stats", "broadcast-transmit-shreds-stats",
("slot", slot as i64, i64), ("slot", slot as i64, i64),
@ -53,6 +67,7 @@ impl BroadcastStats for TransmitShredsStats {
("dropped_packets", self.dropped_packets as i64, i64), ("dropped_packets", self.dropped_packets as i64, i64),
); );
} }
}
} }
#[derive(Default, Clone)] #[derive(Default, Clone)]
@ -65,7 +80,19 @@ impl BroadcastStats for InsertShredsStats {
self.insert_shreds_elapsed += new_stats.insert_shreds_elapsed; self.insert_shreds_elapsed += new_stats.insert_shreds_elapsed;
self.num_shreds += new_stats.num_shreds; self.num_shreds += new_stats.num_shreds;
} }
fn report_stats(&mut self, slot: Slot, slot_start: Instant) { fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool) {
if was_interrupted {
datapoint_info!(
"broadcast-insert-shreds-interrupted-stats",
("slot", slot as i64, i64),
(
"insert_shreds_elapsed",
self.insert_shreds_elapsed as i64,
i64
),
("num_shreds", self.num_shreds as i64, i64),
);
} else {
datapoint_info!( datapoint_info!(
"broadcast-insert-shreds-stats", "broadcast-insert-shreds-stats",
("slot", slot as i64, i64), ("slot", slot as i64, i64),
@ -84,6 +111,7 @@ impl BroadcastStats for InsertShredsStats {
("num_shreds", self.num_shreds as i64, i64), ("num_shreds", self.num_shreds as i64, i64),
); );
} }
}
} }
// Tracks metrics of type `T` across multiple threads // Tracks metrics of type `T` across multiple threads
@ -128,9 +156,11 @@ impl<T: BroadcastStats + Default> SlotBroadcastStats<T> {
} }
if let Some(num_expected_batches) = slot_batch_counter.num_expected_batches { if let Some(num_expected_batches) = slot_batch_counter.num_expected_batches {
if slot_batch_counter.num_batches == num_expected_batches { if slot_batch_counter.num_batches == num_expected_batches {
slot_batch_counter slot_batch_counter.broadcast_shred_stats.report_stats(
.broadcast_shred_stats batch_info.slot,
.report_stats(batch_info.slot, batch_info.slot_start_ts); batch_info.slot_start_ts,
batch_info.was_interrupted,
);
should_delete = true; should_delete = true;
} }
} }
@ -159,7 +189,7 @@ mod test {
self.count += new_stats.count; self.count += new_stats.count;
self.sender = new_stats.sender.clone(); self.sender = new_stats.sender.clone();
} }
fn report_stats(&mut self, slot: Slot, slot_start: Instant) { fn report_stats(&mut self, slot: Slot, slot_start: Instant, _was_interrupted: bool) {
self.sender self.sender
.as_ref() .as_ref()
.unwrap() .unwrap()
@ -186,6 +216,7 @@ mod test {
slot: 0, slot: 0,
num_expected_batches: Some(2), num_expected_batches: Some(2),
slot_start_ts: start, slot_start_ts: start,
was_interrupted: false,
}), }),
); );
@ -242,6 +273,7 @@ mod test {
slot: 0, slot: 0,
num_expected_batches: None, num_expected_batches: None,
slot_start_ts: start, slot_start_ts: start,
was_interrupted: false,
}), }),
); );
@ -265,6 +297,7 @@ mod test {
slot, slot,
num_expected_batches: None, num_expected_batches: None,
slot_start_ts: start, slot_start_ts: start,
was_interrupted: false,
}; };
if i == round % num_threads { if i == round % num_threads {
broadcast_batch_info.num_expected_batches = Some(num_threads); broadcast_batch_info.num_expected_batches = Some(num_threads);

View File

@ -92,7 +92,7 @@ impl StandardBroadcastRun {
stats, stats,
); );
shreds.insert(0, shred); shreds.insert(0, shred);
self.report_and_reset_stats(); self.report_and_reset_stats(true);
self.unfinished_slot = None; self.unfinished_slot = None;
shreds shreds
} }
@ -240,6 +240,7 @@ impl StandardBroadcastRun {
"Old broadcast start time for previous slot must exist if the previous slot "Old broadcast start time for previous slot must exist if the previous slot
was interrupted", was interrupted",
), ),
was_interrupted: true,
}); });
let shreds = Arc::new(prev_slot_shreds); let shreds = Arc::new(prev_slot_shreds);
debug_assert!(shreds.iter().all(|shred| shred.slot() == slot)); debug_assert!(shreds.iter().all(|shred| shred.slot() == slot));
@ -262,6 +263,7 @@ impl StandardBroadcastRun {
slot_start_ts: self slot_start_ts: self
.slot_broadcast_start .slot_broadcast_start
.expect("Start timestamp must exist for a slot if we're broadcasting the slot"), .expect("Start timestamp must exist for a slot if we're broadcasting the slot"),
was_interrupted: false,
}); });
get_leader_schedule_time.stop(); get_leader_schedule_time.stop();
@ -297,7 +299,7 @@ impl StandardBroadcastRun {
self.process_shreds_stats.update(&process_stats); self.process_shreds_stats.update(&process_stats);
if last_tick_height == bank.max_tick_height() { if last_tick_height == bank.max_tick_height() {
self.report_and_reset_stats(); self.report_and_reset_stats(false);
self.unfinished_slot = None; self.unfinished_slot = None;
} }
@ -380,9 +382,32 @@ impl StandardBroadcastRun {
transmit_shreds_stats.update(new_transmit_shreds_stats, broadcast_shred_batch_info); transmit_shreds_stats.update(new_transmit_shreds_stats, broadcast_shred_batch_info);
} }
fn report_and_reset_stats(&mut self) { fn report_and_reset_stats(&mut self, was_interrupted: bool) {
let stats = &self.process_shreds_stats; let stats = &self.process_shreds_stats;
let unfinished_slot = self.unfinished_slot.as_ref().unwrap(); let unfinished_slot = self.unfinished_slot.as_ref().unwrap();
if was_interrupted {
datapoint_info!(
"broadcast-process-shreds-interrupted-stats",
("slot", unfinished_slot.slot as i64, i64),
("shredding_time", stats.shredding_elapsed, i64),
("receive_time", stats.receive_elapsed, i64),
(
"num_data_shreds",
unfinished_slot.next_shred_index as i64,
i64
),
(
"get_leader_schedule_time",
stats.get_leader_schedule_elapsed,
i64
),
("serialize_shreds_time", stats.serialize_elapsed, i64),
("gen_data_time", stats.gen_data_elapsed, i64),
("gen_coding_time", stats.gen_coding_elapsed, i64),
("sign_coding_time", stats.sign_coding_elapsed, i64),
("coding_send_time", stats.coding_send_elapsed, i64),
);
} else {
datapoint_info!( datapoint_info!(
"broadcast-process-shreds-stats", "broadcast-process-shreds-stats",
("slot", unfinished_slot.slot as i64, i64), ("slot", unfinished_slot.slot as i64, i64),
@ -409,6 +434,7 @@ impl StandardBroadcastRun {
("sign_coding_time", stats.sign_coding_elapsed, i64), ("sign_coding_time", stats.sign_coding_elapsed, i64),
("coding_send_time", stats.coding_send_elapsed, i64), ("coding_send_time", stats.coding_send_elapsed, i64),
); );
}
self.process_shreds_stats.reset(); self.process_shreds_stats.reset();
} }
} }