Separate out interrupted slots broadcast metrics (#20537)
This commit is contained in:
		| @@ -89,6 +89,7 @@ impl BroadcastRun for BroadcastFakeShredsRun { | |||||||
|             slot, |             slot, | ||||||
|             num_expected_batches: None, |             num_expected_batches: None, | ||||||
|             slot_start_ts: Instant::now(), |             slot_start_ts: Instant::now(), | ||||||
|  |             was_interrupted: false, | ||||||
|         }; |         }; | ||||||
|         // 3) Start broadcast step |         // 3) Start broadcast step | ||||||
|         //some indicates fake shreds |         //some indicates fake shreds | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ use super::*; | |||||||
|  |  | ||||||
| pub(crate) trait BroadcastStats { | pub(crate) trait BroadcastStats { | ||||||
|     fn update(&mut self, new_stats: &Self); |     fn update(&mut self, new_stats: &Self); | ||||||
|     fn report_stats(&mut self, slot: Slot, slot_start: Instant); |     fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool); | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Clone)] | #[derive(Clone)] | ||||||
| @@ -10,6 +10,7 @@ pub(crate) struct BroadcastShredBatchInfo { | |||||||
|     pub(crate) slot: Slot, |     pub(crate) slot: Slot, | ||||||
|     pub(crate) num_expected_batches: Option<usize>, |     pub(crate) num_expected_batches: Option<usize>, | ||||||
|     pub(crate) slot_start_ts: Instant, |     pub(crate) slot_start_ts: Instant, | ||||||
|  |     pub(crate) was_interrupted: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Default, Clone)] | #[derive(Default, Clone)] | ||||||
| @@ -33,25 +34,39 @@ impl BroadcastStats for TransmitShredsStats { | |||||||
|         self.total_packets += new_stats.total_packets; |         self.total_packets += new_stats.total_packets; | ||||||
|         self.dropped_packets += new_stats.dropped_packets; |         self.dropped_packets += new_stats.dropped_packets; | ||||||
|     } |     } | ||||||
|     fn report_stats(&mut self, slot: Slot, slot_start: Instant) { |     fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool) { | ||||||
|         datapoint_info!( |         if was_interrupted { | ||||||
|             "broadcast-transmit-shreds-stats", |             datapoint_info!( | ||||||
|             ("slot", slot as i64, i64), |                 "broadcast-transmit-shreds-interrupted-stats", | ||||||
|             ( |                 ("slot", slot as i64, i64), | ||||||
|                 "end_to_end_elapsed", |                 ("transmit_elapsed", self.transmit_elapsed as i64, i64), | ||||||
|                 // `slot_start` signals when the first batch of shreds was |                 ("send_mmsg_elapsed", self.send_mmsg_elapsed as i64, i64), | ||||||
|                 // received, used to measure duration of broadcast |                 ("get_peers_elapsed", self.get_peers_elapsed as i64, i64), | ||||||
|                 slot_start.elapsed().as_micros() as i64, |                 ("num_shreds", self.num_shreds as i64, i64), | ||||||
|                 i64 |                 ("shred_select", self.shred_select as i64, i64), | ||||||
|             ), |                 ("total_packets", self.total_packets as i64, i64), | ||||||
|             ("transmit_elapsed", self.transmit_elapsed as i64, i64), |                 ("dropped_packets", self.dropped_packets as i64, i64), | ||||||
|             ("send_mmsg_elapsed", self.send_mmsg_elapsed as i64, i64), |             ); | ||||||
|             ("get_peers_elapsed", self.get_peers_elapsed as i64, i64), |         } else { | ||||||
|             ("num_shreds", self.num_shreds as i64, i64), |             datapoint_info!( | ||||||
|             ("shred_select", self.shred_select as i64, i64), |                 "broadcast-transmit-shreds-stats", | ||||||
|             ("total_packets", self.total_packets as i64, i64), |                 ("slot", slot as i64, i64), | ||||||
|             ("dropped_packets", self.dropped_packets as i64, i64), |                 ( | ||||||
|         ); |                     "end_to_end_elapsed", | ||||||
|  |                     // `slot_start` signals when the first batch of shreds was | ||||||
|  |                     // received, used to measure duration of broadcast | ||||||
|  |                     slot_start.elapsed().as_micros() as i64, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ("transmit_elapsed", self.transmit_elapsed as i64, i64), | ||||||
|  |                 ("send_mmsg_elapsed", self.send_mmsg_elapsed as i64, i64), | ||||||
|  |                 ("get_peers_elapsed", self.get_peers_elapsed as i64, i64), | ||||||
|  |                 ("num_shreds", self.num_shreds as i64, i64), | ||||||
|  |                 ("shred_select", self.shred_select as i64, i64), | ||||||
|  |                 ("total_packets", self.total_packets as i64, i64), | ||||||
|  |                 ("dropped_packets", self.dropped_packets as i64, i64), | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -65,24 +80,37 @@ impl BroadcastStats for InsertShredsStats { | |||||||
|         self.insert_shreds_elapsed += new_stats.insert_shreds_elapsed; |         self.insert_shreds_elapsed += new_stats.insert_shreds_elapsed; | ||||||
|         self.num_shreds += new_stats.num_shreds; |         self.num_shreds += new_stats.num_shreds; | ||||||
|     } |     } | ||||||
|     fn report_stats(&mut self, slot: Slot, slot_start: Instant) { |     fn report_stats(&mut self, slot: Slot, slot_start: Instant, was_interrupted: bool) { | ||||||
|         datapoint_info!( |         if was_interrupted { | ||||||
|             "broadcast-insert-shreds-stats", |             datapoint_info!( | ||||||
|             ("slot", slot as i64, i64), |                 "broadcast-insert-shreds-interrupted-stats", | ||||||
|             ( |                 ("slot", slot as i64, i64), | ||||||
|                 "end_to_end_elapsed", |                 ( | ||||||
|                 // `slot_start` signals when the first batch of shreds was |                     "insert_shreds_elapsed", | ||||||
|                 // received, used to measure duration of broadcast |                     self.insert_shreds_elapsed as i64, | ||||||
|                 slot_start.elapsed().as_micros() as i64, |                     i64 | ||||||
|                 i64 |                 ), | ||||||
|             ), |                 ("num_shreds", self.num_shreds as i64, i64), | ||||||
|             ( |             ); | ||||||
|                 "insert_shreds_elapsed", |         } else { | ||||||
|                 self.insert_shreds_elapsed as i64, |             datapoint_info!( | ||||||
|                 i64 |                 "broadcast-insert-shreds-stats", | ||||||
|             ), |                 ("slot", slot as i64, i64), | ||||||
|             ("num_shreds", self.num_shreds as i64, i64), |                 ( | ||||||
|         ); |                     "end_to_end_elapsed", | ||||||
|  |                     // `slot_start` signals when the first batch of shreds was | ||||||
|  |                     // received, used to measure duration of broadcast | ||||||
|  |                     slot_start.elapsed().as_micros() as i64, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ( | ||||||
|  |                     "insert_shreds_elapsed", | ||||||
|  |                     self.insert_shreds_elapsed as i64, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ("num_shreds", self.num_shreds as i64, i64), | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -128,9 +156,11 @@ impl<T: BroadcastStats + Default> SlotBroadcastStats<T> { | |||||||
|                 } |                 } | ||||||
|                 if let Some(num_expected_batches) = slot_batch_counter.num_expected_batches { |                 if let Some(num_expected_batches) = slot_batch_counter.num_expected_batches { | ||||||
|                     if slot_batch_counter.num_batches == num_expected_batches { |                     if slot_batch_counter.num_batches == num_expected_batches { | ||||||
|                         slot_batch_counter |                         slot_batch_counter.broadcast_shred_stats.report_stats( | ||||||
|                             .broadcast_shred_stats |                             batch_info.slot, | ||||||
|                             .report_stats(batch_info.slot, batch_info.slot_start_ts); |                             batch_info.slot_start_ts, | ||||||
|  |                             batch_info.was_interrupted, | ||||||
|  |                         ); | ||||||
|                         should_delete = true; |                         should_delete = true; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -159,7 +189,7 @@ mod test { | |||||||
|             self.count += new_stats.count; |             self.count += new_stats.count; | ||||||
|             self.sender = new_stats.sender.clone(); |             self.sender = new_stats.sender.clone(); | ||||||
|         } |         } | ||||||
|         fn report_stats(&mut self, slot: Slot, slot_start: Instant) { |         fn report_stats(&mut self, slot: Slot, slot_start: Instant, _was_interrupted: bool) { | ||||||
|             self.sender |             self.sender | ||||||
|                 .as_ref() |                 .as_ref() | ||||||
|                 .unwrap() |                 .unwrap() | ||||||
| @@ -186,6 +216,7 @@ mod test { | |||||||
|                 slot: 0, |                 slot: 0, | ||||||
|                 num_expected_batches: Some(2), |                 num_expected_batches: Some(2), | ||||||
|                 slot_start_ts: start, |                 slot_start_ts: start, | ||||||
|  |                 was_interrupted: false, | ||||||
|             }), |             }), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
| @@ -242,6 +273,7 @@ mod test { | |||||||
|                 slot: 0, |                 slot: 0, | ||||||
|                 num_expected_batches: None, |                 num_expected_batches: None, | ||||||
|                 slot_start_ts: start, |                 slot_start_ts: start, | ||||||
|  |                 was_interrupted: false, | ||||||
|             }), |             }), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
| @@ -265,6 +297,7 @@ mod test { | |||||||
|                         slot, |                         slot, | ||||||
|                         num_expected_batches: None, |                         num_expected_batches: None, | ||||||
|                         slot_start_ts: start, |                         slot_start_ts: start, | ||||||
|  |                         was_interrupted: false, | ||||||
|                     }; |                     }; | ||||||
|                     if i == round % num_threads { |                     if i == round % num_threads { | ||||||
|                         broadcast_batch_info.num_expected_batches = Some(num_threads); |                         broadcast_batch_info.num_expected_batches = Some(num_threads); | ||||||
|   | |||||||
| @@ -92,7 +92,7 @@ impl StandardBroadcastRun { | |||||||
|                     stats, |                     stats, | ||||||
|                 ); |                 ); | ||||||
|                 shreds.insert(0, shred); |                 shreds.insert(0, shred); | ||||||
|                 self.report_and_reset_stats(); |                 self.report_and_reset_stats(true); | ||||||
|                 self.unfinished_slot = None; |                 self.unfinished_slot = None; | ||||||
|                 shreds |                 shreds | ||||||
|             } |             } | ||||||
| @@ -240,6 +240,7 @@ impl StandardBroadcastRun { | |||||||
|                     "Old broadcast start time for previous slot must exist if the previous slot |                     "Old broadcast start time for previous slot must exist if the previous slot | ||||||
|                  was interrupted", |                  was interrupted", | ||||||
|                 ), |                 ), | ||||||
|  |                 was_interrupted: true, | ||||||
|             }); |             }); | ||||||
|             let shreds = Arc::new(prev_slot_shreds); |             let shreds = Arc::new(prev_slot_shreds); | ||||||
|             debug_assert!(shreds.iter().all(|shred| shred.slot() == slot)); |             debug_assert!(shreds.iter().all(|shred| shred.slot() == slot)); | ||||||
| @@ -262,6 +263,7 @@ impl StandardBroadcastRun { | |||||||
|             slot_start_ts: self |             slot_start_ts: self | ||||||
|                 .slot_broadcast_start |                 .slot_broadcast_start | ||||||
|                 .expect("Start timestamp must exist for a slot if we're broadcasting the slot"), |                 .expect("Start timestamp must exist for a slot if we're broadcasting the slot"), | ||||||
|  |             was_interrupted: false, | ||||||
|         }); |         }); | ||||||
|         get_leader_schedule_time.stop(); |         get_leader_schedule_time.stop(); | ||||||
|  |  | ||||||
| @@ -297,7 +299,7 @@ impl StandardBroadcastRun { | |||||||
|         self.process_shreds_stats.update(&process_stats); |         self.process_shreds_stats.update(&process_stats); | ||||||
|  |  | ||||||
|         if last_tick_height == bank.max_tick_height() { |         if last_tick_height == bank.max_tick_height() { | ||||||
|             self.report_and_reset_stats(); |             self.report_and_reset_stats(false); | ||||||
|             self.unfinished_slot = None; |             self.unfinished_slot = None; | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -380,35 +382,59 @@ impl StandardBroadcastRun { | |||||||
|         transmit_shreds_stats.update(new_transmit_shreds_stats, broadcast_shred_batch_info); |         transmit_shreds_stats.update(new_transmit_shreds_stats, broadcast_shred_batch_info); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn report_and_reset_stats(&mut self) { |     fn report_and_reset_stats(&mut self, was_interrupted: bool) { | ||||||
|         let stats = &self.process_shreds_stats; |         let stats = &self.process_shreds_stats; | ||||||
|         let unfinished_slot = self.unfinished_slot.as_ref().unwrap(); |         let unfinished_slot = self.unfinished_slot.as_ref().unwrap(); | ||||||
|         datapoint_info!( |         if was_interrupted { | ||||||
|             "broadcast-process-shreds-stats", |             datapoint_info!( | ||||||
|             ("slot", unfinished_slot.slot as i64, i64), |                 "broadcast-process-shreds-interrupted-stats", | ||||||
|             ("shredding_time", stats.shredding_elapsed, i64), |                 ("slot", unfinished_slot.slot as i64, i64), | ||||||
|             ("receive_time", stats.receive_elapsed, i64), |                 ("shredding_time", stats.shredding_elapsed, i64), | ||||||
|             ( |                 ("receive_time", stats.receive_elapsed, i64), | ||||||
|                 "num_data_shreds", |                 ( | ||||||
|                 unfinished_slot.next_shred_index as i64, |                     "num_data_shreds", | ||||||
|                 i64 |                     unfinished_slot.next_shred_index as i64, | ||||||
|             ), |                     i64 | ||||||
|             ( |                 ), | ||||||
|                 "slot_broadcast_time", |                 ( | ||||||
|                 self.slot_broadcast_start.unwrap().elapsed().as_micros() as i64, |                     "get_leader_schedule_time", | ||||||
|                 i64 |                     stats.get_leader_schedule_elapsed, | ||||||
|             ), |                     i64 | ||||||
|             ( |                 ), | ||||||
|                 "get_leader_schedule_time", |                 ("serialize_shreds_time", stats.serialize_elapsed, i64), | ||||||
|                 stats.get_leader_schedule_elapsed, |                 ("gen_data_time", stats.gen_data_elapsed, i64), | ||||||
|                 i64 |                 ("gen_coding_time", stats.gen_coding_elapsed, i64), | ||||||
|             ), |                 ("sign_coding_time", stats.sign_coding_elapsed, i64), | ||||||
|             ("serialize_shreds_time", stats.serialize_elapsed, i64), |                 ("coding_send_time", stats.coding_send_elapsed, i64), | ||||||
|             ("gen_data_time", stats.gen_data_elapsed, i64), |             ); | ||||||
|             ("gen_coding_time", stats.gen_coding_elapsed, i64), |         } else { | ||||||
|             ("sign_coding_time", stats.sign_coding_elapsed, i64), |             datapoint_info!( | ||||||
|             ("coding_send_time", stats.coding_send_elapsed, i64), |                 "broadcast-process-shreds-stats", | ||||||
|         ); |                 ("slot", unfinished_slot.slot as i64, i64), | ||||||
|  |                 ("shredding_time", stats.shredding_elapsed, i64), | ||||||
|  |                 ("receive_time", stats.receive_elapsed, i64), | ||||||
|  |                 ( | ||||||
|  |                     "num_data_shreds", | ||||||
|  |                     unfinished_slot.next_shred_index as i64, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ( | ||||||
|  |                     "slot_broadcast_time", | ||||||
|  |                     self.slot_broadcast_start.unwrap().elapsed().as_micros() as i64, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ( | ||||||
|  |                     "get_leader_schedule_time", | ||||||
|  |                     stats.get_leader_schedule_elapsed, | ||||||
|  |                     i64 | ||||||
|  |                 ), | ||||||
|  |                 ("serialize_shreds_time", stats.serialize_elapsed, i64), | ||||||
|  |                 ("gen_data_time", stats.gen_data_elapsed, i64), | ||||||
|  |                 ("gen_coding_time", stats.gen_coding_elapsed, i64), | ||||||
|  |                 ("sign_coding_time", stats.sign_coding_elapsed, i64), | ||||||
|  |                 ("coding_send_time", stats.coding_send_elapsed, i64), | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|         self.process_shreds_stats.reset(); |         self.process_shreds_stats.reset(); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user